jerrrrry / infinicore

Commit 784139b9 (unverified), authored Feb 13, 2026 by thatPepe; committed by GitHub on Feb 13, 2026.

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention

Parents: 3c8fb3c0, 1d6527cb
Showing 20 changed files with 411 additions and 40 deletions (+411, -40).
| File | Changes |
|---|---|
| include/infiniop/ops/kv_caching.h | +31 -0 |
| include/infiniop/ops/quant/per_channel_quant_int8.h | +28 -0 |
| include/infiniop/ops/silu_and_mul.h | +71 -0 |
| python/infinicore/__init__.py | +8 -1 |
| python/infinicore/_preload.py | +121 -0 |
| python/infinicore/device.py | +1 -0 |
| python/infinicore/nn/functional/__init__.py | +10 -4 |
| python/infinicore/nn/functional/embedding.py | +2 -3 |
| python/infinicore/nn/functional/flash_attention.py | +34 -0 |
| python/infinicore/nn/functional/linear_w8a8i8.py | +31 -0 |
| python/infinicore/nn/functional/silu_and_mul.py | +17 -0 |
| python/infinicore/ops/add_rms_norm.py | +8 -21 |
| python/infinicore/ops/kv_caching.py | +13 -0 |
| scripts/build_ntops.py | +25 -8 |
| scripts/python_test.py | +3 -0 |
| src/infiniccl-test/main.cpp | +2 -1 |
| src/infiniccl/cambricon/infiniccl_cambricon.cc | +1 -1 |
| src/infiniccl/cuda/infiniccl_cuda.h | +1 -1 |
| src/infiniccl/infiniccl.cc | +3 -0 |
| src/infinicore-test/README.md | +1 -0 |
include/infiniop/ops/kv_caching.h (new file, mode 100644)

```c
#ifndef __INFINIOP_KV_CACHING_API_H__
#define __INFINIOP_KV_CACHING_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopKVCachingDescriptor_t;

__C __export infiniStatus_t infiniopCreateKVCachingDescriptor(
    infiniopHandle_t handle,
    infiniopKVCachingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t k_cache,
    infiniopTensorDescriptor_t v_cache,
    infiniopTensorDescriptor_t k,
    infiniopTensorDescriptor_t v,
    infiniopTensorDescriptor_t past_kv_lengths);

__C __export infiniStatus_t infiniopGetKVCachingWorkspaceSize(
    infiniopKVCachingDescriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopKVCaching(
    infiniopKVCachingDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *k_cache,
    void *v_cache,
    const void *k,
    const void *v,
    const void *past_kv_lengths,
    void *stream);

__C __export infiniStatus_t infiniopDestroyKVCachingDescriptor(
    infiniopKVCachingDescriptor_t desc);

#endif
```
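For orientation, here is a minimal NumPy sketch of the update this operator is expected to perform, assuming the usual KV-cache semantics suggested by the parameter names (each request's new `k`/`v` rows are written into `k_cache`/`v_cache` starting at the offset recorded in `past_kv_lengths`). The shapes and layout below are illustrative assumptions, not the kernel's actual memory layout.

```python
import numpy as np

# Assumed illustrative shapes:
#   k_cache, v_cache : [batch, max_seq_len, n_kv_heads, head_dim]
#   k, v             : [batch, new_len,     n_kv_heads, head_dim]
#   past_kv_lengths  : [batch] int32, tokens already cached per request
def kv_caching_reference(k_cache, v_cache, k, v, past_kv_lengths):
    new_len = k.shape[1]
    for b, past in enumerate(past_kv_lengths):
        # Append this request's new keys/values right after its cached prefix.
        k_cache[b, past : past + new_len] = k[b]
        v_cache[b, past : past + new_len] = v[b]
    return k_cache, v_cache
```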
include/infiniop/ops/quant/per_channel_quant_int8.h (new file, mode 100644)

```c
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(
    infiniopHandle_t handle,
    infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
    infiniopTensorDescriptor_t x_packed_desc,
    infiniopTensorDescriptor_t x_scale_desc,
    infiniopTensorDescriptor_t x_zero_desc,
    infiniopTensorDescriptor_t x_desc);

__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(
    infiniopPerChannelQuantI8Descriptor_t desc,
    size_t *size);

__C __export infiniStatus_t infiniopPerChannelQuantI8(
    infiniopPerChannelQuantI8Descriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *x_packed,
    void *x_scale,
    void *x_zero,
    const void *x,
    void *stream);

__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(
    infiniopPerChannelQuantI8Descriptor_t desc);

#endif
```
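As a rough reference for what this operator computes, the sketch below performs per-channel asymmetric int8 quantization in NumPy, producing quantized values, a per-channel scale, and a per-channel zero point. The channel axis, rounding mode, zero-point convention, and packing used by the actual kernel are assumptions made only for illustration.

```python
import numpy as np

def per_channel_quant_int8_reference(x):
    # x: float activations [tokens, channels]; statistics taken per channel (axis 0 reduced).
    x_min = x.min(axis=0, keepdims=True)
    x_max = x.max(axis=0, keepdims=True)
    x_scale = (x_max - x_min) / 255.0                 # map each channel's range onto 256 int8 levels
    x_scale = np.where(x_scale == 0, 1.0, x_scale)    # guard constant channels
    x_zero = np.round(-128.0 - x_min / x_scale)       # zero point chosen so x_min maps to -128
    x_packed = np.clip(np.round(x / x_scale) + x_zero, -128, 127).astype(np.int8)
    return x_packed, x_scale.astype(np.float32), x_zero.astype(np.int32)
```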
include/infiniop/ops/silu_and_mul.h (new file, mode 100644)

```c
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__

#include "../operator_descriptor.h"

/**
 * @brief Opaque handle for the SiluAndMul descriptor.
 */
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;

/**
 * @brief Creates a descriptor for the SiLU and Multiply (SiluAndMul) operation.
 *
 * Format: (input_shape, output_shape)
 * Referencing the vLLM SiluAndMul kernel interface:
 * - input_shape is [..., 2*d] (the last dimension is split into two halves for SiLU and multiplication)
 * - output_shape is [..., d] (the last dimension is reduced to half)
 *
 * @param handle The handle to the InfiniOP library context.
 * @param desc_ptr A pointer to store the created descriptor.
 * @param output Descriptor for the output tensor. Shape [..., d].
 * @param input Descriptor for the input tensor. Shape [..., 2*d].
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);

/**
 * @brief Queries the workspace size required for SiluAndMul computation.
 * @param desc The SiluAndMul descriptor.
 * @param size Pointer to store the required workspace size in bytes.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size);

/**
 * @brief Executes the SiluAndMul operation.
 *
 * Performs SiLU activation on the first half of the last dimension of `input`,
 * multiplies it element-wise with the second half, and stores the result in `output`.
 *
 * @param desc The SiluAndMul descriptor.
 * @param workspace Pointer to workspace memory allocated according to GetWorkspaceSize().
 * @param workspace_size Size of the workspace in bytes.
 * @param output Pointer to the output tensor memory. Shape [..., d].
 * @param input Pointer to the input tensor memory. Shape [..., 2*d].
 * @param stream Pointer to the execution stream (e.g., a CUDA stream). Can be NULL for the default stream.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);

/**
 * @brief Destroys a previously created SiluAndMul descriptor.
 * @param desc The descriptor to destroy.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc);

#endif // __INFINIOP_SILU_AND_MUL_API_H__
```
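The documented contract maps directly onto a short NumPy reference, which may help when validating the kernel. This is a sketch of the math only; the real operator runs on device memory through the descriptor/workspace lifecycle declared above.

```python
import numpy as np

def silu_and_mul_reference(x):
    """SiLU(gate) * up, where the last dimension of x is laid out as [gate | up]."""
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return gate / (1.0 + np.exp(-gate)) * up   # SiLU(z) = z * sigmoid(z)
```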
python/infinicore/__init__.py

```diff
+import contextlib
+
+with contextlib.suppress(ImportError):
+    from ._preload import preload
+    preload()
 
 import infinicore.context as context
 import infinicore.nn as nn
 ...
@@ -43,8 +48,9 @@ from infinicore.dtype import (
     uint8,
 )
 from infinicore.ops.add import add
-from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
+from infinicore.ops.add_rms_norm import add_rms_norm
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 ...
@@ -115,6 +121,7 @@ __all__ = [
     "add_rms_norm",
     "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
 ...
```
python/infinicore/_preload.py (new file, mode 100644)

```python
import ctypes
import os
from typing import Iterable, List


def _candidate_prefixes(path: str) -> List[str]:
    """
    Return HPCC install prefixes to search for libs.

    Prefer HPCC_PATH; if absent and explicitly opted-in, fall back to /opt/hpcc.
    """
    prefixes: List[str] = []
    if path:
        prefixes.append(path)
    seen = set()
    unique: List[str] = []
    for p in prefixes:
        if p and p not in seen:
            seen.add(p)
            unique.append(p)
    return unique


def _try_load(paths: Iterable[str], name: str) -> bool:
    """Try to load a shared library from given paths or system search path."""
    for path in paths:
        full = os.path.join(path, "lib", name)
        if os.path.exists(full):
            try:
                ctypes.CDLL(full, mode=ctypes.RTLD_GLOBAL)
                return True
            except OSError:
                # Try next candidate
                continue
    # Last resort: rely on loader search path
    try:
        ctypes.CDLL(name, mode=ctypes.RTLD_GLOBAL)
        return True
    except OSError:
        return False


def preload_hpcc() -> None:
    """
    Best-effort preload of key HPCC runtime libs with RTLD_GLOBAL.

    This mirrors the behavior of torch's HPCC build that loads libtorch_global_deps.so,
    but avoids introducing a hard torch dependency. All failures are swallowed.
    """
    hpcc_path = os.getenv("HPCC_PATH")
    if not hpcc_path:
        return
    prefixes = _candidate_prefixes(hpcc_path)
    libs = [
        "libhcruntime.so",
        "libhcToolsExt.so",
        "libruntime_cu.so",
        "libhccompiler.so",
    ]
    for lib in libs:
        _try_load(prefixes, lib)


def _should_preload_device(device_type: str) -> bool:
    """
    Check if preload is needed for a specific device type.
    """
    device_env_map = {
        "METAX": ["HPCC_PATH", "INFINICORE_PRELOAD_HPCC"],  # HPCC/METAX
        # Add other device types here as needed:
        # "ASCEND": ["ASCEND_PATH"],
        # "CAMBRICON": ["NEUWARE_HOME"],
    }
    env_vars = device_env_map.get(device_type, [])
    for env_var in env_vars:
        if os.getenv(env_var):
            return True
    return False


def preload_device(device_type: str) -> None:
    """
    Preload runtime libraries for a specific device type if needed.

    Args:
        device_type: Device type name (e.g., "METAX", "ASCEND", etc.)
    """
    if device_type == "METAX":
        preload_hpcc()
    # Add other device preload functions here as needed:
    # elif device_type == "ASCEND":
    #     preload_ascend()
    # etc.


def preload() -> None:
    """
    Universal preload function that loops through device types and preloads when required.

    This function detects available device types and preloads their runtime libraries
    if the environment indicates they are needed.
    """
    # Device types that may require preload
    device_types = [
        "METAX",  # HPCC/METAX
        # Add other device types here as they are implemented:
        # "ASCEND",
        # "CAMBRICON",
        # etc.
    ]
    for device_type in device_types:
        if _should_preload_device(device_type):
            try:
                preload_device(device_type)
            except Exception:
                # Swallow all errors - preload is best-effort
                pass
```
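The module is driven entirely by environment variables, so a minimal sketch of how a METAX/HPCC user would exercise it looks like this; the install prefix is an illustrative assumption, not a documented default.

```python
import os

# Point HPCC_PATH at your actual HPCC installation before importing infinicore,
# so that the package __init__ can preload the runtime libs (path is illustrative).
os.environ["HPCC_PATH"] = "/opt/hpcc"

import infinicore  # __init__ calls preload() on import, best-effort

# preload() can also be invoked explicitly; it is effectively a no-op
# when the HPCC-related environment variables are unset.
from infinicore._preload import preload
preload()
```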
python/infinicore/device.py

```diff
@@ -82,6 +82,7 @@ _TORCH_DEVICE_MAP = {
     _infinicore.Device.Type.KUNLUN: "cuda",
     _infinicore.Device.Type.HYGON: "cuda",
     _infinicore.Device.Type.QY: "cuda",
+    _infinicore.Device.Type.ALI: "cuda",
 }
 ...
```
python/infinicore/nn/functional/__init__.py

```diff
 from .causal_softmax import causal_softmax
 from .embedding import embedding
+from .flash_attention import flash_attention
 from .linear import linear
+from .linear_w8a8i8 import linear_w8a8i8
 from .random_sample import random_sample
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .silu_and_mul import silu_and_mul
 from .swiglu import swiglu
 
 __all__ = [
     "causal_softmax",
+    "embedding",
+    "flash_attention",
+    "linear",
     "random_sample",
     "rms_norm",
+    "RopeAlgo",
+    "rope",
     "silu",
     "swiglu",
-    "linear",
-    "embedding",
-    "rope",
-    "RopeAlgo",
+    "linear_w8a8i8",
+    "silu_and_mul",
 ]
```
python/infinicore/nn/functional/embedding.py

```diff
@@ -22,9 +22,8 @@ def embedding(
         and (sparse is False)
     ), "Unsupported parameters."
 
-    assert "cpu" == input.device.type, (
-        "The device of 'input' variable must be on the CPU."
-    )
+    # Note: embedding now supports device-side input for graph recording
+    # The C++ implementation handles both CPU and device-side inputs
 
     if out is None:
         return Tensor(_infinicore.embedding(input._underlying, weight._underlying))
 ...
```
python/infinicore/nn/functional/flash_attention.py (new file, mode 100644)

```python
import math

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def flash_attention(
    query,
    key,
    value,
    total_kv_len,
    attn_mask=None,
    dropout_p=0,
    is_causal=False,
    scale=None,
    enable_gqa=False,
):
    assert attn_mask is None and dropout_p == 0 and not enable_gqa

    emb_dim = query.shape[-1]

    if scale is None:
        scale = 1 / math.sqrt(emb_dim)

    return Tensor(
        _infinicore.flash_attention(
            query._underlying,
            key._underlying,
            value._underlying,
            total_kv_len._underlying,
            scale,
            is_causal,
        )
    )
```
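For readers unfamiliar with the wrapper's arguments, this is roughly the math the underlying kernel computes per head, sketched in NumPy under the wrapper's defaults (no mask, no dropout, no GQA). The paged/variable-length handling driven by `total_kv_len` is omitted; treat the shapes and mask construction as illustrative assumptions.

```python
import math
import numpy as np

def sdpa_reference(q, k, v, is_causal=False, scale=None):
    # q: [q_len, d], k/v: [kv_len, d] for a single head
    if scale is None:
        scale = 1 / math.sqrt(q.shape[-1])
    scores = q @ k.T * scale                                  # [q_len, kv_len]
    if is_causal:
        q_len, kv_len = scores.shape
        # Each query attends only to keys up to its own (cache-offset) position.
        mask = np.tril(np.ones((q_len, kv_len), dtype=bool), k=kv_len - q_len)
        scores = np.where(mask, scores, -np.inf)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v
```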
python/infinicore/nn/functional/linear_w8a8i8.py (new file, mode 100644)

```python
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8 with per-tensor scale."""
    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )

    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
```
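To make the docstring concrete, here is a NumPy sketch of the arithmetic a W8A8 int8 linear layer typically performs. The quantization granularity and layout are assumptions: the activation uses a single per-tensor scale (matching the docstring), and the weight is assumed to carry a per-output-channel scale; the kernel's actual packing may differ.

```python
import numpy as np

def linear_w8a8i8_reference(x, w_q, w_scale, bias=None):
    # x: float activations [m, k]; w_q: int8 weights [n, k]; w_scale: [n] per-output-channel scales.
    x_scale = np.abs(x).max() / 127.0                        # per-tensor activation scale
    if x_scale == 0:
        x_scale = 1.0
    x_q = np.clip(np.round(x / x_scale), -127, 127).astype(np.int8)
    acc = x_q.astype(np.int32) @ w_q.astype(np.int32).T     # int32 accumulation
    y = acc.astype(np.float32) * (x_scale * w_scale)         # dequantize per output channel
    return y if bias is None else y + bias
```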
python/infinicore/nn/functional/silu_and_mul.py (new file, mode 100644)

```python
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU and Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], Output shape: [..., d]
    """
    if out is None:
        return Tensor(_infinicore.silu_and_mul(input._underlying))

    _infinicore.silu_and_mul_(out._underlying, input._underlying)
    return out
```
python/infinicore/ops/add_rms_norm.py

```diff
+import infinicore.tensor as tensor
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
 
-def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
+def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None, residual=None):
     """
     Fused Add and RMS Normalization.
 ...
@@ -18,30 +18,17 @@ def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
     The add_result can be used as residual for subsequent layers.
     """
     if out is None:
-        result = _infinicore.add_rms_norm(a._underlying, b._underlying, weight._underlying, epsilon)
-        return (Tensor(result[0]), Tensor(result[1]))
+        out = tensor.empty(a.shape, dtype=a.dtype, device=a.device)
+    if residual is None:
+        residual = tensor.empty(b.shape, dtype=b.dtype, device=b.device)
 
-    y, residual_out = out
     _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
+        out._underlying,
+        residual._underlying,
         a._underlying,
         b._underlying,
         weight._underlying,
         epsilon,
     )
-    return (y, residual_out)
-
-
-def add_rms_norm_(y, residual_out, a, b, weight, epsilon=1e-5):
-    """In-place Fused Add and RMS Normalization."""
-    _infinicore.add_rms_norm_(
-        y._underlying,
-        residual_out._underlying,
-        a._underlying,
-        b._underlying,
-        weight._underlying,
-        epsilon,
-    )
+    return out, residual
```
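The fused op combines a residual add with RMS normalization; a NumPy sketch of the semantics the docstring describes follows. The epsilon placement and weight broadcasting are the usual RMSNorm conventions and are assumptions here, not taken from the kernel source.

```python
import numpy as np

def add_rms_norm_reference(a, b, weight, epsilon=1e-5):
    residual = a + b                                  # the "add" half; reusable as the next layer's residual
    rms = np.sqrt((residual.astype(np.float32) ** 2).mean(axis=-1, keepdims=True) + epsilon)
    y = residual / rms * weight                       # the "RMS norm" half, with per-channel weight
    return y, residual
```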
python/infinicore/ops/kv_caching.py (new file, mode 100644)

```python
from infinicore.lib import _infinicore


def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    _infinicore.kv_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        past_kv_lengths._underlying,
    )
    return k_cache, v_cache
```
scripts/build_ntops.py

```diff
+import concurrent.futures
 import importlib
 import pathlib
 ...
@@ -11,16 +12,32 @@ SRC_DIR_PATH = CURRENT_FILE_PATH.parent.parent / "src"
 
 def _find_and_build_ops():
     ops_path = SRC_DIR_PATH / "infiniop" / "ops"
 
-    for op_dir in ops_path.iterdir():
-        ninetoothed_path = op_dir / "ninetoothed"
-        if ninetoothed_path.is_dir():
-            module_path = ninetoothed_path / "build"
-            relative_path = module_path.relative_to(SRC_DIR_PATH)
-            import_name = ".".join(relative_path.parts)
-            module = importlib.import_module(import_name)
-            module.build()
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        futures = []
+
+        for op_dir in ops_path.iterdir():
+            ninetoothed_path = op_dir / "ninetoothed"
+
+            if not ninetoothed_path.is_dir():
+                continue
+
+            build_file = ninetoothed_path / "build.py"
+            if not build_file.exists():
+                continue
+
+            futures.append(executor.submit(_build, ninetoothed_path))
+
+        for future in concurrent.futures.as_completed(futures):
+            future.result()
+
+
+def _build(ninetoothed_path):
+    module_path = ninetoothed_path / "build"
+    relative_path = module_path.relative_to(SRC_DIR_PATH)
+    import_name = ".".join(relative_path.parts)
+    module = importlib.import_module(import_name)
+    module.build()
 
 
 if __name__ == "__main__":
 ...
```
scripts/python_test.py

```diff
@@ -39,6 +39,9 @@ def run_tests(args):
         "topkrouter.py",
         "topksoftmax.py",
         "zeros.py",
+        # "paged_attention.py",
+        # "paged_caching.py",
+        # "paged_attention_prefill.py"
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug",
             text=True,
             encoding="utf-8",
             shell=True
 ...
```
src/infiniccl-test/main.cpp

```diff
@@ -12,7 +12,7 @@ void printUsage() {
     std::cout << "infiniccl-test --<device>" << std::endl
               << std::endl;
     std::cout << "    --<device>" << std::endl;
-    std::cout << "        Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon)." << std::endl
+    std::cout << "        Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali)." << std::endl
               << std::endl;
     std::cout << "The program will run tests on all visible devices of the specified device type."
               << " Use Environmental Variables such as CUDA_VSIBLE_DEVICES to limit visible device IDs.";
 ...
@@ -46,6 +46,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
     else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
     else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
     else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
+    else PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
     else {
         printUsage();
     }
 ...
```
src/infiniccl/cambricon/infiniccl_cambricon.cc

```diff
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
     for (int i = 0; i < ndevice; i++) {
         rank_list[i] = i;
-        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), CNRT_RET_SUCCESS);
+        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
     }
 
     CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
 ...
```
src/infiniccl/cuda/infiniccl_cuda.h

```diff
@@ -4,7 +4,7 @@
 #include "../infiniccl_impl.h"
 
 // Windows does not support CUDA
-#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
+#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
 INFINICCL_DEVICE_API_IMPL(cuda)
 #else
 INFINICCL_DEVICE_API_NOOP(cuda)
 ...
```
src/infiniccl/infiniccl.cc

```diff
@@ -27,6 +27,7 @@ __C infiniStatus_t infinicclCommInitAll(
         COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
         COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
         COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_INIT_ALL(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
 ...
@@ -53,6 +54,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
         COMM_DESTROY(INFINI_DEVICE_METAX, metax);
         COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
         COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
+        COMM_DESTROY(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
 ...
@@ -86,6 +88,7 @@ __C infiniStatus_t infinicclAllReduce(
         ALL_REDUCE(INFINI_DEVICE_METAX, metax);
         ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
         ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
+        ALL_REDUCE(INFINI_DEVICE_ALI, cuda);
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 ...
```
src/infinicore-test/README.md

````diff
@@ -66,6 +66,7 @@ xmake build infinicore-test
 ./infinicore-test --qy
 ./infinicore-test --kunlun
 ./infinicore-test --hygon
+./infinicore-test --ali
 ```
 
 ### Customize Test Parameters
 ...
````