Unverified Commit 784139b9 authored by thatPepe, committed by GitHub

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention
parents 3c8fb3c0 1d6527cb
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;

// Creates a descriptor for writing new k/v entries into the KV caches.
__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
    infiniopHandle_t handle,
    infiniopKVCachingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t k_cache,
    infiniopTensorDescriptor_t v_cache,
    infiniopTensorDescriptor_t k,
    infiniopTensorDescriptor_t v,
    infiniopTensorDescriptor_t past_kv_lengths);

// Queries the workspace size (in bytes) required by the KV caching operation.
__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(infiniopKVCachingDescriptor_t desc, size_t *size);

// Executes the KV caching operation on the given stream.
__C __export infiniStatus_t infiniopKVCaching(infiniopKVCachingDescriptor_t desc,
                                              void *workspace,
                                              size_t workspace_size,
                                              void *k_cache,
                                              void *v_cache,
                                              const void *k,
                                              const void *v,
                                              const void *past_kv_lengths,
                                              void *stream);

// Destroys a previously created KV caching descriptor.
__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(infiniopKVCachingDescriptor_t desc);

#endif
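For orientation, the update rule implied by this interface can be sketched in NumPy. The shapes and in-place semantics below are assumptions read off the signature, not taken from the kernel:

```python
import numpy as np

def kv_caching_reference(k_cache, v_cache, k, v, past_kv_lengths):
    """Assumed semantics: append this step's K/V rows for sequence i at
    offset past_kv_lengths[i], updating both caches in place.

    Assumed shapes: k_cache/v_cache [nseq, max_len, nhead, dhead],
    k/v [nseq, new_len, nhead, dhead], past_kv_lengths [nseq].
    """
    new_len = k.shape[1]
    for i, pos in enumerate(past_kv_lengths):
        k_cache[i, pos : pos + new_len] = k[i]
        v_cache[i, pos : pos + new_len] = v[i]
    return k_cache, v_cache
```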
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

// Creates a descriptor for per-channel int8 quantization of x into x_packed,
// with per-channel scales (x_scale) and zero points (x_zero).
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
                                                                      infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
                                                                      infiniopTensorDescriptor_t x_packed_desc,
                                                                      infiniopTensorDescriptor_t x_scale_desc,
                                                                      infiniopTensorDescriptor_t x_zero_desc,
                                                                      infiniopTensorDescriptor_t x_desc);

// Queries the workspace size (in bytes) required by the quantization.
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);

// Executes the quantization on the given stream.
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
                                                      void *workspace,
                                                      size_t workspace_size,
                                                      void *x_packed,
                                                      void *x_scale,
                                                      void *x_zero,
                                                      const void *x,
                                                      void *stream);

// Destroys a previously created per-channel quantization descriptor.
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);

#endif
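The header does not pin down the exact quantization scheme; given the separate x_scale and x_zero outputs, a plausible per-channel asymmetric int8 reference looks like the following (arithmetic only; the real kernel's packing and layout are unspecified):

```python
import numpy as np

def per_channel_quant_i8_reference(x):
    """Hypothetical per-channel asymmetric int8 quantization: each channel
    (row) of x gets its own scale and zero point mapping [min, max] onto
    [-128, 127]."""
    x_min = x.min(axis=-1, keepdims=True)
    x_max = x.max(axis=-1, keepdims=True)
    scale = np.maximum((x_max - x_min) / 255.0, 1e-12)  # guard constant rows
    zero = np.round(-128.0 - x_min / scale)             # maps x_min -> -128
    x_packed = np.clip(np.round(x / scale + zero), -128, 127).astype(np.int8)
    return x_packed, scale, zero
```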
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__
#include "../operator_descriptor.h"
/**
* @brief Opaque handle for the SiluAndMul descriptor.
*/
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;
/**
* @brief Creates a descriptor for the SiLU and Multiply (SiluAndMul) operation.
*
* Format: (input_shape, output_shape)
* Referencing vLLM kernel SiluAndMul interface:
* - input_shape is [..., 2*d] (last dimension is split into two halves for SiLU and multiplication)
* - output_shape is [..., d] (last dimension reduced to half)
*
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param output Descriptor for the output tensor. Shape [..., d].
* @param input Descriptor for the input tensor. Shape [..., 2*d].
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);
/**
* @brief Queries the workspace size required for SiluAndMul computation.
* @param desc The SiluAndMul descriptor.
* @param size Pointer to store the required workspace size in bytes.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size);
/**
* @brief Executes the SiluAndMul operation.
*
* Performs SiLU activation on the first half of the last dimension of `input`,
* multiplies element-wise with the second half, and stores the result in `output`.
*
* @param desc The SiluAndMul descriptor.
* @param workspace Pointer to workspace memory allocated according to GetWorkspaceSize().
* @param workspace_size Size of the workspace in bytes.
* @param output Pointer to the output tensor memory. Shape [..., d].
* @param input Pointer to the input tensor memory. Shape [..., 2*d].
* @param stream Pointer to the execution stream (e.g., CUDA stream). Can be NULL for default stream.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);
/**
* @brief Destroys a previously created SiluAndMul descriptor.
* @param desc The descriptor to destroy.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc);
#endif // __INFINIOP_SILU_AND_MUL_API_H__
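Since the doc comments fully specify the computation, a NumPy restatement may help: SiLU is applied to the first half of the last dimension and multiplied elementwise with the second half:

```python
import numpy as np

def silu_and_mul_reference(x):
    """output = SiLU(x[..., :d]) * x[..., d:], for x of shape [..., 2*d]."""
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return gate / (1.0 + np.exp(-gate)) * up  # SiLU(x) = x * sigmoid(x)
```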
 import contextlib
+
+with contextlib.suppress(ImportError):
+    from ._preload import preload
+
+    preload()
 
 import infinicore.context as context
 import infinicore.nn as nn
...
@@ -43,8 +48,9 @@ from infinicore.dtype import (
     uint8,
 )
 from infinicore.ops.add import add
-from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
+from infinicore.ops.add_rms_norm import add_rms_norm
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
...
@@ -115,6 +121,7 @@ __all__ = [
     "add_rms_norm",
     "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
...
import ctypes
import os
from typing import Iterable, List
def _candidate_prefixes(path: str) -> List[str]:
    """Return HPCC install prefixes to search for libs, de-duplicated while
    preserving order. Currently only the given path (typically HPCC_PATH)
    is used."""
    prefixes: List[str] = []
    if path:
        prefixes.append(path)

    seen = set()
    unique: List[str] = []
    for p in prefixes:
        if p and p not in seen:
            seen.add(p)
            unique.append(p)
    return unique
def _try_load(paths: Iterable[str], name: str) -> bool:
    """Try to load a shared library from the given paths or the system search path."""
    for path in paths:
        full = os.path.join(path, "lib", name)
        if os.path.exists(full):
            try:
                ctypes.CDLL(full, mode=ctypes.RTLD_GLOBAL)
                return True
            except OSError:
                # Try next candidate
                continue

    # Last resort: rely on loader search path
    try:
        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
        return True
    except OSError:
        return False
def preload_hpcc() -> None:
    """Best-effort preload of key HPCC runtime libs with RTLD_GLOBAL.

    This mirrors the behavior of torch's HPCC build that loads
    libtorch_global_deps.so, but avoids introducing a hard torch dependency.
    All failures are swallowed.
    """
    hpcc_path = os.getenv("HPCC_PATH")
    if not hpcc_path:
        return

    prefixes = _candidate_prefixes(hpcc_path)
    libs = [
        "libhcruntime.so",
        "libhcToolsExt.so",
        "libruntime_cu.so",
        "libhccompiler.so",
    ]
    for lib in libs:
        _try_load(prefixes, lib)
def _should_preload_device(device_type: str) -> bool:
    """Check if preload is needed for a specific device type."""
    device_env_map = {
        "METAX": ["HPCC_PATH", "INFINICORE_PRELOAD_HPCC"],  # HPCC/METAX
        # Add other device types here as needed:
        # "ASCEND": ["ASCEND_PATH"],
        # "CAMBRICON": ["NEUWARE_HOME"],
    }

    env_vars = device_env_map.get(device_type, [])
    for env_var in env_vars:
        if os.getenv(env_var):
            return True
    return False
def preload_device(device_type: str) -> None:
    """Preload runtime libraries for a specific device type if needed.

    Args:
        device_type: Device type name (e.g., "METAX", "ASCEND", etc.)
    """
    if device_type == "METAX":
        preload_hpcc()
    # Add other device preload functions here as needed:
    # elif device_type == "ASCEND":
    #     preload_ascend()
    # etc.
def preload() -> None:
    """Universal preload function that loops through device types and preloads when required.

    This function detects available device types and preloads their runtime
    libraries if the environment indicates they are needed.
    """
    # Device types that may require preload
    device_types = [
        "METAX",  # HPCC/METAX
        # Add other device types here as they are implemented:
        # "ASCEND",
        # "CAMBRICON",
        # etc.
    ]
    for device_type in device_types:
        if _should_preload_device(device_type):
            try:
                preload_device(device_type)
            except Exception:
                # Swallow all errors - preload is best-effort
                pass
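For illustration, the hook is driven entirely by environment variables; a minimal sketch (the install prefix below is hypothetical):

```python
import os

# Hypothetical setup: point HPCC_PATH at a METAX/HPCC install prefix before
# importing infinicore; the package __init__ then calls preload(), which
# loads libhcruntime.so and friends with RTLD_GLOBAL on a best-effort basis.
os.environ["HPCC_PATH"] = "/opt/hpcc"  # hypothetical location

import infinicore  # noqa: E402 -- import after env setup is intentional
```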
@@ -82,6 +82,7 @@ _TORCH_DEVICE_MAP = {
     _infinicore.Device.Type.KUNLUN: "cuda",
     _infinicore.Device.Type.HYGON: "cuda",
     _infinicore.Device.Type.QY: "cuda",
+    _infinicore.Device.Type.ALI: "cuda",
 }
...
 from .causal_softmax import causal_softmax
 from .embedding import embedding
+from .flash_attention import flash_attention
 from .linear import linear
+from .linear_w8a8i8 import linear_w8a8i8
 from .random_sample import random_sample
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .silu_and_mul import silu_and_mul
 from .swiglu import swiglu
 
 __all__ = [
     "causal_softmax",
+    "embedding",
+    "flash_attention",
+    "linear",
     "random_sample",
     "rms_norm",
+    "RopeAlgo",
+    "rope",
     "silu",
     "swiglu",
-    "linear",
-    "embedding",
-    "rope",
-    "RopeAlgo",
+    "linear_w8a8i8",
+    "silu_and_mul",
 ]
@@ -22,9 +22,8 @@ def embedding(
         and (sparse is False)
     ), "Unsupported parameters."
-    assert "cpu" == input.device.type, (
-        "The device of 'input' variable must be on the CPU."
-    )
+    # Note: embedding now supports device-side input for graph recording
+    # The C++ implementation handles both CPU and device-side inputs
     if out is None:
         return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
...
import math

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def flash_attention(
    query,
    key,
    value,
    total_kv_len,
    attn_mask=None,
    dropout_p=0,
    is_causal=False,
    scale=None,
    enable_gqa=False,
):
    assert attn_mask is None and dropout_p == 0 and not enable_gqa

    emb_dim = query.shape[-1]
    if scale is None:
        scale = 1 / math.sqrt(emb_dim)

    return Tensor(
        _infinicore.flash_attention(
            query._underlying,
            key._underlying,
            value._underlying,
            total_kv_len._underlying,
            scale,
            is_causal,
        )
    )
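As a sanity reference, the wrapper's defaults match standard scaled dot-product attention; a minimal NumPy sketch under assumed 2-D [len, dhead] layouts (the fused kernel's actual tiling and total_kv_len handling are not shown):

```python
import math
import numpy as np

def sdpa_reference(q, k, v, scale=None, is_causal=False):
    """Assumed reference for the fused kernel: softmax(q @ k^T * scale) @ v,
    with the query block aligned to the end of the kv range when
    is_causal=True. q: [lq, d]; k, v: [lk, d]."""
    if scale is None:
        scale = 1 / math.sqrt(q.shape[-1])
    logits = q @ k.T * scale
    if is_causal:
        lq, lk = q.shape[0], k.shape[0]
        mask = np.tril(np.ones((lq, lk), dtype=bool), k=lk - lq)
        logits = np.where(mask, logits, -np.inf)
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v
```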
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""
    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )

    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
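The docstring fixes the input scale as per-tensor; the rest of the arithmetic below is an assumption (per-output-channel weight scales, int32 accumulation) meant to show intent, not the kernel's actual packing:

```python
import numpy as np

def linear_w8a8i8_reference(x, w_q, w_scale, bias=None):
    """Assumed w8a8i8 arithmetic: quantize activations per-tensor to int8,
    matmul in int32, then dequantize with both scales. w_q: [out, in] int8,
    w_scale: [out] (assumed per output channel)."""
    x_scale = np.abs(x).max() / 127.0                    # per-tensor scale
    x_q = np.clip(np.round(x / x_scale), -128, 127).astype(np.int8)
    acc = x_q.astype(np.int32) @ w_q.astype(np.int32).T  # int32 accumulation
    y = acc * x_scale * w_scale                          # dequantize
    return y + bias if bias is not None else y
```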
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU and Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], output shape: [..., d]
    """
    if out is None:
        return Tensor(_infinicore.silu_and_mul(input._underlying))

    _infinicore.silu_and_mul_(out._underlying, input._underlying)
    return out
+import infinicore.tensor as tensor
 from infinicore.lib import _infinicore
-from infinicore.tensor import Tensor
 
 
-def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
+def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None, residual=None):
     """
     Fused Add and RMS Normalization.
...
@@ -18,30 +18,17 @@ def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
     The add_result can be used as residual for subsequent layers.
     """
     if out is None:
-        result = _infinicore.add_rms_norm(
-            a._underlying, b._underlying, weight._underlying, epsilon
-        )
-        return (Tensor(result[0]), Tensor(result[1]))
-
-    y, residual_out = out
+        out = tensor.empty(a.shape, dtype=a.dtype, device=a.device)
+    if residual is None:
+        residual = tensor.empty(b.shape, dtype=b.dtype, device=b.device)
+
     _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
+        out._underlying,
+        residual._underlying,
         a._underlying,
         b._underlying,
         weight._underlying,
         epsilon,
     )
-    return (y, residual_out)
-
-
-def add_rms_norm_(y, residual_out, a, b, weight, epsilon=1e-5):
-    """In-place Fused Add and RMS Normalization."""
-    _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
-        a._underlying,
-        b._underlying,
-        weight._underlying,
-        epsilon,
-    )
+
+    return out, residual
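The fused op's semantics, per its docstring ("Fused Add and RMS Normalization", with the add result reusable as the residual), can be restated in NumPy as a correctness reference:

```python
import numpy as np

def add_rms_norm_reference(a, b, weight, epsilon=1e-5):
    """Assumed semantics: residual = a + b, then RMS-normalize the residual
    over the last axis and scale by weight."""
    residual = a + b
    rms = np.sqrt((residual * residual).mean(axis=-1, keepdims=True) + epsilon)
    return residual / rms * weight, residual
```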
from infinicore.lib import _infinicore


def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Write this step's k/v entries into the caches in place and return them."""
    _infinicore.kv_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        past_kv_lengths._underlying,
    )
    return k_cache, v_cache
+import concurrent.futures
 import importlib
 import pathlib
...
@@ -11,16 +12,32 @@ SRC_DIR_PATH = CURRENT_FILE_PATH.parent.parent / "src"
 
 def _find_and_build_ops():
     ops_path = SRC_DIR_PATH / "infiniop" / "ops"
 
-    for op_dir in ops_path.iterdir():
-        ninetoothed_path = op_dir / "ninetoothed"
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        futures = []
 
-        if ninetoothed_path.is_dir():
-            module_path = ninetoothed_path / "build"
-            relative_path = module_path.relative_to(SRC_DIR_PATH)
-            import_name = ".".join(relative_path.parts)
-            module = importlib.import_module(import_name)
-            module.build()
+        for op_dir in ops_path.iterdir():
+            ninetoothed_path = op_dir / "ninetoothed"
+
+            if not ninetoothed_path.is_dir():
+                continue
+
+            build_file = ninetoothed_path / "build.py"
+
+            if not build_file.exists():
+                continue
+
+            futures.append(executor.submit(_build, ninetoothed_path))
+
+        for future in concurrent.futures.as_completed(futures):
+            future.result()
+
+
+def _build(ninetoothed_path):
+    module_path = ninetoothed_path / "build"
+    relative_path = module_path.relative_to(SRC_DIR_PATH)
+    import_name = ".".join(relative_path.parts)
+    module = importlib.import_module(import_name)
+    module.build()
 
 
 if __name__ == "__main__":
...
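The concurrency pattern used above, in isolation (a generic sketch, not tied to this repo's module layout): submit independent build jobs to a process pool, then call result() on each future so worker exceptions propagate to the parent:

```python
import concurrent.futures

def _work(name: str) -> str:
    # Stand-in for an expensive, independent build step.
    return f"built {name}"

if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(_work, n) for n in ("op_a", "op_b", "op_c")]
        for future in concurrent.futures.as_completed(futures):
            print(future.result())  # re-raises any worker exception
```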
@@ -39,6 +39,9 @@ def run_tests(args):
         "topkrouter.py",
         "topksoftmax.py",
         "zeros.py",
+        # "paged_attention.py",
+        # "paged_caching.py",
+        # "paged_attention_prefill.py"
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
...
@@ -12,7 +12,7 @@ void printUsage() {
     std::cout << "infiniccl-test --<device>" << std::endl
               << std::endl;
     std::cout << "  --<device>" << std::endl;
-    std::cout << "      Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon)." << std::endl
+    std::cout << "      Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali)." << std::endl
               << std::endl;
     std::cout << "The program will run tests on all visible devices of the specified device type."
               << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
@@ -46,6 +46,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
     else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
     else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
     else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
+    else PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
    else {
        printUsage();
    }
...
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
     for (int i = 0; i < ndevice; i++) {
         rank_list[i] = i;
-        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), CNRT_RET_SUCCESS);
+        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
     }
 
     CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
...
@@ -4,7 +4,7 @@
 #include "../infiniccl_impl.h"
 
 // Windows does not support CUDA
-#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
+#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
 INFINICCL_DEVICE_API_IMPL(cuda)
 #else
 INFINICCL_DEVICE_API_NOOP(cuda)
...
@@ -27,6 +27,7 @@ __C infiniStatus_t infinicclCommInitAll(
         COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
         COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
         COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_INIT_ALL(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
@@ -53,6 +54,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
         COMM_DESTROY(INFINI_DEVICE_METAX, metax);
         COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
         COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_DESTROY(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
@@ -86,6 +88,7 @@ __C infiniStatus_t infinicclAllReduce(
         ALL_REDUCE(INFINI_DEVICE_METAX, metax);
         ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
         ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
+        ALL_REDUCE(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -66,6 +66,7 @@ xmake build infinicore-test
 ./infinicore-test --qy
 ./infinicore-test --kunlun
 ./infinicore-test --hygon
+./infinicore-test --ali
 ```
 
 ### Customize Test Parameters
...