Unverified Commit 784139b9 authored by thatPepe, committed by GitHub

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention
parents 3c8fb3c0 1d6527cb
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;

// Creates a descriptor for writing new k/v entries into the KV caches.
__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
    infiniopHandle_t handle,
    infiniopKVCachingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t k_cache,
    infiniopTensorDescriptor_t v_cache,
    infiniopTensorDescriptor_t k,
    infiniopTensorDescriptor_t v,
    infiniopTensorDescriptor_t past_kv_lengths);

// Queries the workspace size (in bytes) required by the KV caching operation.
__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(infiniopKVCachingDescriptor_t desc, size_t *size);

// Executes the KV caching operation on the given stream.
__C __export infiniStatus_t infiniopKVCaching(infiniopKVCachingDescriptor_t desc,
                                              void *workspace,
                                              size_t workspace_size,
                                              void *k_cache,
                                              void *v_cache,
                                              const void *k,
                                              const void *v,
                                              const void *past_kv_lengths,
                                              void *stream);

// Destroys a previously created KV caching descriptor.
__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(infiniopKVCachingDescriptor_t desc);

#endif
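For orientation, the update rule implied by this interface can be sketched in NumPy. The shapes and in-place semantics below are assumptions read off the signature, not taken from the kernel:

```python
import numpy as np

def kv_caching_reference(k_cache, v_cache, k, v, past_kv_lengths):
    """Assumed semantics: append this step's K/V rows for sequence i at
    offset past_kv_lengths[i], updating both caches in place.

    Assumed shapes: k_cache/v_cache [nseq, max_len, nhead, dhead],
    k/v [nseq, new_len, nhead, dhead], past_kv_lengths [nseq].
    """
    new_len = k.shape[1]
    for i, pos in enumerate(past_kv_lengths):
        k_cache[i, pos : pos + new_len] = k[i]
        v_cache[i, pos : pos + new_len] = v[i]
    return k_cache, v_cache
```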
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

// Creates a descriptor for per-channel int8 quantization of x into x_packed,
// with per-channel scales (x_scale) and zero points (x_zero).
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
                                                                      infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
                                                                      infiniopTensorDescriptor_t x_packed_desc,
                                                                      infiniopTensorDescriptor_t x_scale_desc,
                                                                      infiniopTensorDescriptor_t x_zero_desc,
                                                                      infiniopTensorDescriptor_t x_desc);

// Queries the workspace size (in bytes) required by the quantization.
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);

// Executes the quantization on the given stream.
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
                                                      void *workspace,
                                                      size_t workspace_size,
                                                      void *x_packed,
                                                      void *x_scale,
                                                      void *x_zero,
                                                      const void *x,
                                                      void *stream);

// Destroys a previously created per-channel quantization descriptor.
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);

#endif
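The header does not pin down the exact quantization scheme; given the separate x_scale and x_zero outputs, a plausible per-channel asymmetric int8 reference looks like the following (arithmetic only; the real kernel's packing and layout are unspecified):

```python
import numpy as np

def per_channel_quant_i8_reference(x):
    """Hypothetical per-channel asymmetric int8 quantization: each channel
    (row) of x gets its own scale and zero point mapping [min, max] onto
    [-128, 127]."""
    x_min = x.min(axis=-1, keepdims=True)
    x_max = x.max(axis=-1, keepdims=True)
    scale = np.maximum((x_max - x_min) / 255.0, 1e-12)  # guard constant rows
    zero = np.round(-128.0 - x_min / scale)             # maps x_min -> -128
    x_packed = np.clip(np.round(x / scale + zero), -128, 127).astype(np.int8)
    return x_packed, scale, zero
```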
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__
#include "../operator_descriptor.h"
/**
* @brief Opaque handle for the SiluAndMul descriptor.
*/
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;
/**
* @brief Creates a descriptor for the SiLU and Multiply (SiluAndMul) operation.
*
* Format: (input_shape, output_shape)
* Referencing vLLM kernel SiluAndMul interface:
* - input_shape is [..., 2*d] (last dimension is split into two halves for SiLU and multiplication)
* - output_shape is [..., d] (last dimension reduced to half)
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param output Descriptor for the output tensor. Shape [..., d].
* @param input Descriptor for the input tensor. Shape [..., 2*d].
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);
/**
* @brief Queries the workspace size required for SiluAndMul computation.
* @param desc The SiluAndMul descriptor.
* @param size Pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size);
/**
* @brief Executes the SiluAndMul operation.
*
* Performs SiLU activation on the first half of the last dimension of `input`,
* multiplies element-wise with the second half, and stores the result in `output`.
*
* @param desc The SiluAndMul descriptor.
* @param workspace Pointer to workspace memory allocated according to GetWorkspaceSize().
* @param workspace_size Size of the workspace in bytes.
* @param output Pointer to the output tensor memory. Shape [..., d].
* @param input Pointer to the input tensor memory. Shape [..., 2*d].
* @param stream Pointer to the execution stream (e.g., CUDA stream). Can be NULL for default stream.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);
/**
* @brief Destroys a previously created SiluAndMul descriptor.
* @param desc The descriptor to destroy.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc);
#endif // __INFINIOP_SILU_AND_MUL_API_H__
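Since the doc comments fully specify the computation, a NumPy restatement may help: SiLU is applied to the first half of the last dimension and multiplied elementwise with the second half:

```python
import numpy as np

def silu_and_mul_reference(x):
    """output = SiLU(x[..., :d]) * x[..., d:], for x of shape [..., 2*d]."""
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return gate / (1.0 + np.exp(-gate)) * up  # SiLU(x) = x * sigmoid(x)
```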
 import contextlib
+
+with contextlib.suppress(ImportError):
+    from ._preload import preload
+
+    preload()
 
 import infinicore.context as context
 import infinicore.nn as nn
...
@@ -43,8 +48,9 @@ from infinicore.dtype import (
     uint8,
 )
 from infinicore.ops.add import add
-from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
+from infinicore.ops.add_rms_norm import add_rms_norm
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
...
@@ -115,6 +121,7 @@ __all__ = [
     "add_rms_norm",
     "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
...
import ctypes
import os
from typing import Iterable, List
def _candidate_prefixes(path: str) -> List[str]:
    """Return HPCC install prefixes to search for libs, de-duplicated while
    preserving order. Currently only the given path (typically HPCC_PATH)
    is used."""
    prefixes: List[str] = []
    if path:
        prefixes.append(path)

    seen = set()
    unique: List[str] = []
    for p in prefixes:
        if p and p not in seen:
            seen.add(p)
            unique.append(p)
    return unique
def _try_load(paths: Iterable[str], name: str) -> bool:
    """Try to load a shared library from the given paths or the system search path."""
    for path in paths:
        full = os.path.join(path, "lib", name)
        if os.path.exists(full):
            try:
                ctypes.CDLL(full, mode=ctypes.RTLD_GLOBAL)
                return True
            except OSError:
                # Try next candidate
                continue

    # Last resort: rely on loader search path
    try:
        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
        return True
    except OSError:
        return False
def preload_hpcc() -> None:
    """Best-effort preload of key HPCC runtime libs with RTLD_GLOBAL.

    This mirrors the behavior of torch's HPCC build that loads
    libtorch_global_deps.so, but avoids introducing a hard torch dependency.
    All failures are swallowed.
    """
    hpcc_path = os.getenv("HPCC_PATH")
    if not hpcc_path:
        return

    prefixes = _candidate_prefixes(hpcc_path)
    libs = [
        "libhcruntime.so",
        "libhcToolsExt.so",
        "libruntime_cu.so",
        "libhccompiler.so",
    ]
    for lib in libs:
        _try_load(prefixes, lib)
def _should_preload_device(device_type: str) -> bool:
    """Check if preload is needed for a specific device type."""
    device_env_map = {
        "METAX": ["HPCC_PATH", "INFINICORE_PRELOAD_HPCC"],  # HPCC/METAX
        # Add other device types here as needed:
        # "ASCEND": ["ASCEND_PATH"],
        # "CAMBRICON": ["NEUWARE_HOME"],
    }

    env_vars = device_env_map.get(device_type, [])
    for env_var in env_vars:
        if os.getenv(env_var):
            return True
    return False
def preload_device(device_type: str) -> None:
    """Preload runtime libraries for a specific device type if needed.

    Args:
        device_type: Device type name (e.g., "METAX", "ASCEND", etc.)
    """
    if device_type == "METAX":
        preload_hpcc()
    # Add other device preload functions here as needed:
    # elif device_type == "ASCEND":
    #     preload_ascend()
    # etc.
def preload() -> None:
    """Universal preload function that loops through device types and preloads when required.

    This function detects available device types and preloads their runtime
    libraries if the environment indicates they are needed.
    """
    # Device types that may require preload
    device_types = [
        "METAX",  # HPCC/METAX
        # Add other device types here as they are implemented:
        # "ASCEND",
        # "CAMBRICON",
        # etc.
    ]
    for device_type in device_types:
        if _should_preload_device(device_type):
            try:
                preload_device(device_type)
            except Exception:
                # Swallow all errors - preload is best-effort
                pass
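For illustration, the hook is driven entirely by environment variables; a minimal sketch (the install prefix below is hypothetical):

```python
import os

# Hypothetical setup: point HPCC_PATH at a METAX/HPCC install prefix before
# importing infinicore; the package __init__ then calls preload(), which
# loads libhcruntime.so and friends with RTLD_GLOBAL on a best-effort basis.
os.environ["HPCC_PATH"] = "/opt/hpcc"  # hypothetical location

import infinicore  # noqa: E402 -- import after env setup is intentional
```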
@@ -82,6 +82,7 @@ _TORCH_DEVICE_MAP = {
     _infinicore.Device.Type.KUNLUN: "cuda",
     _infinicore.Device.Type.HYGON: "cuda",
     _infinicore.Device.Type.QY: "cuda",
+    _infinicore.Device.Type.ALI: "cuda",
 }
...
 from .causal_softmax import causal_softmax
 from .embedding import embedding
+from .flash_attention import flash_attention
 from .linear import linear
+from .linear_w8a8i8 import linear_w8a8i8
 from .random_sample import random_sample
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .silu_and_mul import silu_and_mul
 from .swiglu import swiglu
 
 __all__ = [
     "causal_softmax",
+    "embedding",
+    "flash_attention",
+    "linear",
     "random_sample",
     "rms_norm",
+    "RopeAlgo",
+    "rope",
     "silu",
     "swiglu",
-    "linear",
-    "embedding",
-    "rope",
-    "RopeAlgo",
+    "linear_w8a8i8",
+    "silu_and_mul",
 ]
@@ -22,9 +22,8 @@ def embedding(
         and (sparse is False)
     ), "Unsupported parameters."
-    assert "cpu" == input.device.type, (
-        "The device of 'input' variable must be on the CPU."
-    )
+    # Note: embedding now supports device-side input for graph recording
+    # The C++ implementation handles both CPU and device-side inputs
     if out is None:
         return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
...
import math

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def flash_attention(
    query,
    key,
    value,
    total_kv_len,
    attn_mask=None,
    dropout_p=0,
    is_causal=False,
    scale=None,
    enable_gqa=False,
):
    assert attn_mask is None and dropout_p == 0 and not enable_gqa

    emb_dim = query.shape[-1]
    if scale is None:
        scale = 1 / math.sqrt(emb_dim)

    return Tensor(
        _infinicore.flash_attention(
            query._underlying,
            key._underlying,
            value._underlying,
            total_kv_len._underlying,
            scale,
            is_causal,
        )
    )
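As a sanity reference, the wrapper's defaults match standard scaled dot-product attention; a minimal NumPy sketch under assumed 2-D [len, dhead] layouts (the fused kernel's actual tiling and total_kv_len handling are not shown):

```python
import math
import numpy as np

def sdpa_reference(q, k, v, scale=None, is_causal=False):
    """Assumed reference for the fused kernel: softmax(q @ k^T * scale) @ v,
    with the query block aligned to the end of the kv range when
    is_causal=True. q: [lq, d]; k, v: [lk, d]."""
    if scale is None:
        scale = 1 / math.sqrt(q.shape[-1])
    logits = q @ k.T * scale
    if is_causal:
        lq, lk = q.shape[0], k.shape[0]
        mask = np.tril(np.ones((lq, lk), dtype=bool), k=lk - lq)
        logits = np.where(mask, logits, -np.inf)
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v
```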
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""
    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )

    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
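The docstring fixes the input scale as per-tensor; the rest of the arithmetic below is an assumption (per-output-channel weight scales, int32 accumulation) meant to show intent, not the kernel's actual packing:

```python
import numpy as np

def linear_w8a8i8_reference(x, w_q, w_scale, bias=None):
    """Assumed w8a8i8 arithmetic: quantize activations per-tensor to int8,
    matmul in int32, then dequantize with both scales. w_q: [out, in] int8,
    w_scale: [out] (assumed per output channel)."""
    x_scale = np.abs(x).max() / 127.0                    # per-tensor scale
    x_q = np.clip(np.round(x / x_scale), -128, 127).astype(np.int8)
    acc = x_q.astype(np.int32) @ w_q.astype(np.int32).T  # int32 accumulation
    y = acc * x_scale * w_scale                          # dequantize
    return y + bias if bias is not None else y
```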
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU and Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], output shape: [..., d]
    """
    if out is None:
        return Tensor(_infinicore.silu_and_mul(input._underlying))

    _infinicore.silu_and_mul_(out._underlying, input._underlying)
    return out
+import infinicore.tensor as tensor
 from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
 
 
-def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
+def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None, residual=None):
     """
     Fused Add and RMS Normalization.
...
@@ -18,30 +18,17 @@ def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
     The add_result can be used as residual for subsequent layers.
     """
     if out is None:
-        result = _infinicore.add_rms_norm(
-            a._underlying, b._underlying, weight._underlying, epsilon
-        )
-        return (Tensor(result[0]), Tensor(result[1]))
-
-    y, residual_out = out
+        out = tensor.empty(a.shape, dtype=a.dtype, device=a.device)
+    if residual is None:
+        residual = tensor.empty(b.shape, dtype=b.dtype, device=b.device)
+
     _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
+        out._underlying,
+        residual._underlying,
         a._underlying,
         b._underlying,
         weight._underlying,
         epsilon,
     )
-    return (y, residual_out)
-
-
-def add_rms_norm_(y, residual_out, a, b, weight, epsilon=1e-5):
-    """In-place Fused Add and RMS Normalization."""
-    _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
-        a._underlying,
-        b._underlying,
-        weight._underlying,
-        epsilon,
-    )
+
+    return out, residual
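The fused op's semantics, per its docstring ("Fused Add and RMS Normalization", with the add result reusable as the residual), can be restated in NumPy as a correctness reference:

```python
import numpy as np

def add_rms_norm_reference(a, b, weight, epsilon=1e-5):
    """Assumed semantics: residual = a + b, then RMS-normalize the residual
    over the last axis and scale by weight."""
    residual = a + b
    rms = np.sqrt((residual * residual).mean(axis=-1, keepdims=True) + epsilon)
    return residual / rms * weight, residual
```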
from infinicore.lib import _infinicore


def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Write this step's k/v entries into the caches in place and return them."""
    _infinicore.kv_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        past_kv_lengths._underlying,
    )
    return k_cache, v_cache
+import concurrent.futures
 import importlib
 import pathlib
...
@@ -11,16 +12,32 @@ SRC_DIR_PATH = CURRENT_FILE_PATH.parent.parent / "src"
 
 def _find_and_build_ops():
     ops_path = SRC_DIR_PATH / "infiniop" / "ops"
 
-    for op_dir in ops_path.iterdir():
-        ninetoothed_path = op_dir / "ninetoothed"
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        futures = []
 
-        if ninetoothed_path.is_dir():
-            module_path = ninetoothed_path / "build"
-            relative_path = module_path.relative_to(SRC_DIR_PATH)
-            import_name = ".".join(relative_path.parts)
-            module = importlib.import_module(import_name)
-            module.build()
+        for op_dir in ops_path.iterdir():
+            ninetoothed_path = op_dir / "ninetoothed"
+
+            if not ninetoothed_path.is_dir():
+                continue
+
+            build_file = ninetoothed_path / "build.py"
+
+            if not build_file.exists():
+                continue
+
+            futures.append(executor.submit(_build, ninetoothed_path))
+
+        for future in concurrent.futures.as_completed(futures):
+            future.result()
+
+
+def _build(ninetoothed_path):
+    module_path = ninetoothed_path / "build"
+    relative_path = module_path.relative_to(SRC_DIR_PATH)
+    import_name = ".".join(relative_path.parts)
+    module = importlib.import_module(import_name)
+    module.build()
 
 
 if __name__ == "__main__":
...
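The concurrency pattern used above, in isolation (a generic sketch, not tied to this repo's module layout): submit independent build jobs to a process pool, then call result() on each future so worker exceptions propagate to the parent:

```python
import concurrent.futures

def _work(name: str) -> str:
    # Stand-in for an expensive, independent build step.
    return f"built {name}"

if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(_work, n) for n in ("op_a", "op_b", "op_c")]
        for future in concurrent.futures.as_completed(futures):
            print(future.result())  # re-raises any worker exception
```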
@@ -39,6 +39,9 @@ def run_tests(args):
         "topkrouter.py",
         "topksoftmax.py",
         "zeros.py",
+        # "paged_attention.py",
+        # "paged_caching.py",
+        # "paged_attention_prefill.py"
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
...
@@ -12,7 +12,7 @@ void printUsage() {
     std::cout << "infiniccl-test --<device>" << std::endl
               << std::endl;
     std::cout << "  --<device>" << std::endl;
-    std::cout << "      Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon)." << std::endl
+    std::cout << "      Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali)." << std::endl
               << std::endl;
     std::cout << "The program will run tests on all visible devices of the specified device type."
               << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
@@ -46,6 +46,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
     else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
     else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
     else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
+    else PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
    else {
        printUsage();
    }
...
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
     for (int i = 0; i < ndevice; i++) {
         rank_list[i] = i;
-        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), CNRT_RET_SUCCESS);
+        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
     }
 
     CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
...
@@ -4,7 +4,7 @@
 #include "../infiniccl_impl.h"
 
 // Windows does not support CUDA
-#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
+#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
 INFINICCL_DEVICE_API_IMPL(cuda)
 #else
 INFINICCL_DEVICE_API_NOOP(cuda)
...
@@ -27,6 +27,7 @@ __C infiniStatus_t infinicclCommInitAll(
         COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
         COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
         COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_INIT_ALL(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
@@ -53,6 +54,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
         COMM_DESTROY(INFINI_DEVICE_METAX, metax);
         COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
         COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_DESTROY(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
@@ -86,6 +88,7 @@ __C infiniStatus_t infinicclAllReduce(
         ALL_REDUCE(INFINI_DEVICE_METAX, metax);
         ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
         ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
+        ALL_REDUCE(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -66,6 +66,7 @@ xmake build infinicore-test
 ./infinicore-test --qy
 ./infinicore-test --kunlun
 ./infinicore-test --hygon
+./infinicore-test --ali
 ```
 
 ### Customize Test Parameters
...