Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
52f0dcf0
Unverified
Commit
52f0dcf0
authored
Feb 12, 2026
by
thatPepe
Committed by
GitHub
Feb 12, 2026
Browse files
Merge pull request #1019 from InfiniTensor/issue/1008
Issue/1008
parents
d0f405ce
68026bd1
Changes
292
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
148 additions
and
32 deletions
+148
-32
scripts/python_test.py
scripts/python_test.py
+3
-0
src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu
src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu
+2
-0
src/infiniop/ops/layer_norm/operator.cc
src/infiniop/ops/layer_norm/operator.cc
+3
-0
src/infiniop/ops/logsoftmax/nvidia/logsoftmax_nvidia.cu
src/infiniop/ops/logsoftmax/nvidia/logsoftmax_nvidia.cu
+5
-0
src/infiniop/ops/logsoftmax/operator.cc
src/infiniop/ops/logsoftmax/operator.cc
+4
-4
src/infiniop/ops/lp_norm/nvidia/lp_norm_nvidia.cu
src/infiniop/ops/lp_norm/nvidia/lp_norm_nvidia.cu
+2
-0
src/infiniop/ops/paged_attention/cuda/kernel_v2.cuh
src/infiniop/ops/paged_attention/cuda/kernel_v2.cuh
+49
-0
src/infiniop/ops/paged_attention_prefill/cuda/kernel_v2.cuh
src/infiniop/ops/paged_attention_prefill/cuda/kernel_v2.cuh
+18
-14
src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
...ttention_prefill/nvidia/paged_attention_prefill_nvidia.cu
+5
-0
src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
+20
-3
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
+2
-0
src/infiniop/ops/sigmoid/operator.cc
src/infiniop/ops/sigmoid/operator.cc
+13
-4
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
+3
-0
src/infiniop/ops/topksoftmax/operator.cc
src/infiniop/ops/topksoftmax/operator.cc
+13
-1
test/infinicore/ops/abs.py
test/infinicore/ops/abs.py
+1
-1
test/infinicore/ops/acos.py
test/infinicore/ops/acos.py
+1
-1
test/infinicore/ops/acosh.py
test/infinicore/ops/acosh.py
+1
-1
test/infinicore/ops/adaptive_avg_pool1d.py
test/infinicore/ops/adaptive_avg_pool1d.py
+1
-1
test/infinicore/ops/adaptive_avg_pool2d.py
test/infinicore/ops/adaptive_avg_pool2d.py
+1
-1
test/infinicore/ops/adaptive_avg_pool3d.py
test/infinicore/ops/adaptive_avg_pool3d.py
+1
-1
No files found.
scripts/python_test.py
View file @
52f0dcf0
...
@@ -39,6 +39,9 @@ def run_tests(args):
...
@@ -39,6 +39,9 @@ def run_tests(args):
"topkrouter.py"
,
"topkrouter.py"
,
"topksoftmax.py"
,
"topksoftmax.py"
,
"zeros.py"
,
"zeros.py"
,
# "paged_attention.py",
# "paged_caching.py",
# "paged_attention_prefill.py"
]:
]:
result
=
subprocess
.
run
(
result
=
subprocess
.
run
(
f
"python
{
test
}
{
args
}
--debug"
,
text
=
True
,
encoding
=
"utf-8"
,
shell
=
True
f
"python
{
test
}
{
args
}
--debug"
,
text
=
True
,
encoding
=
"utf-8"
,
shell
=
True
...
...
src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu
View file @
52f0dcf0
...
@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
...
@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_512
)
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_512
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_4096
)
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_4096
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_2048
)
{
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_2048
)
}
else
{
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
}
...
...
src/infiniop/ops/layer_norm/operator.cc
View file @
52f0dcf0
...
@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
...
@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/logsoftmax/nvidia/logsoftmax_nvidia.cu
View file @
52f0dcf0
...
@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
...
@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
y
,
x
,
_info
.
x_dtype
,
_info
.
y_dtype
,
_info
.
batch_size
,
_info
.
probs_size
,
_info
.
ndim
,
_info
.
seq_len
,
y
,
x
,
_info
.
x_dtype
,
_info
.
y_dtype
,
_info
.
batch_size
,
_info
.
probs_size
,
_info
.
ndim
,
_info
.
seq_len
,
_info
.
y_stride_b
,
_info
.
y_stride_p
,
_info
.
x_stride_b
,
_info
.
x_stride_p
,
_info
.
y_stride_b
,
_info
.
y_stride_p
,
_info
.
x_stride_b
,
_info
.
x_stride_p
,
_info
.
y_stride_0
,
_info
.
y_stride_1
,
_info
.
x_stride_0
,
_info
.
x_stride_1
,
stream
));
_info
.
y_stride_0
,
_info
.
y_stride_1
,
_info
.
x_stride_0
,
_info
.
x_stride_1
,
stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_2048
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_2048
>
(
y
,
x
,
_info
.
x_dtype
,
_info
.
y_dtype
,
_info
.
batch_size
,
_info
.
probs_size
,
_info
.
ndim
,
_info
.
seq_len
,
_info
.
y_stride_b
,
_info
.
y_stride_p
,
_info
.
x_stride_b
,
_info
.
x_stride_p
,
_info
.
y_stride_0
,
_info
.
y_stride_1
,
_info
.
x_stride_0
,
_info
.
x_stride_1
,
stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
y
,
x
,
_info
.
x_dtype
,
_info
.
y_dtype
,
_info
.
batch_size
,
_info
.
probs_size
,
_info
.
ndim
,
_info
.
seq_len
,
y
,
x
,
_info
.
x_dtype
,
_info
.
y_dtype
,
_info
.
batch_size
,
_info
.
probs_size
,
_info
.
ndim
,
_info
.
seq_len
,
...
...
src/infiniop/ops/logsoftmax/operator.cc
View file @
52f0dcf0
...
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
...
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_ILUVATAR_API
//
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
...
@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
...
@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_ILUVATAR_API
//
GET(INFINI_DEVICE_ILUVATAR, nvidia);
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
);
GET
(
INFINI_DEVICE_QY
,
nvidia
);
...
@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
...
@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_ILUVATAR_API
//
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
...
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
...
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
DESTROY
(
INFINI_DEVICE_ALI
,
nvidia
);
DESTROY
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_ILUVATAR_API
//
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
DESTROY
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
);
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
);
...
...
src/infiniop/ops/lp_norm/nvidia/lp_norm_nvidia.cu
View file @
52f0dcf0
...
@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
...
@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_1024
)
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_1024
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_512
)
{
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_512
)
{
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_512
)
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_512
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_2048
)
{
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_2048
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_4096
)
CALCULATE_LP_NORM_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_4096
)
}
else
{
}
else
{
...
...
src/infiniop/ops/paged_attention/cuda/kernel_v2.cuh
View file @
52f0dcf0
...
@@ -16,17 +16,66 @@ struct OnlineSoftmaxState {
...
@@ -16,17 +16,66 @@ struct OnlineSoftmaxState {
}
}
};
};
__device__
__forceinline__
float
warpReduceSum
(
float
x
)
{
__device__
__forceinline__
float
warpReduceSum
(
float
x
)
{
#if defined(ENABLE_ILUVATAR_API)
// Iluvatar may use warp size 64; __shfl_sync(0xffffffff) only covers 32 threads.
// Use shared-memory tree reduce for portability across warp sizes.
constexpr
int
kMaxWarps
=
16
;
__shared__
float
_reduce_buf
[
kMaxWarps
*
32
];
const
int
lane
=
threadIdx
.
x
&
31
;
const
int
warp_id
=
threadIdx
.
x
/
32
;
_reduce_buf
[
threadIdx
.
x
]
=
x
;
__syncthreads
();
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
if
(
lane
<
offset
)
{
_reduce_buf
[
warp_id
*
32
+
lane
]
+=
_reduce_buf
[
warp_id
*
32
+
lane
+
offset
];
}
__syncthreads
();
}
return
_reduce_buf
[
warp_id
*
32
];
#else
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
+=
__shfl_down_sync
(
0xffffffff
,
x
,
offset
);
x
+=
__shfl_down_sync
(
0xffffffff
,
x
,
offset
);
}
}
return
x
;
return
x
;
#endif
}
__device__
__forceinline__
float
warpBroadcast
(
float
x
,
int
src_lane
)
{
#if defined(ENABLE_ILUVATAR_API)
__shared__
float
_bcast_buf
[
16
];
const
int
warp_id
=
threadIdx
.
x
/
32
;
if
((
threadIdx
.
x
&
31
)
==
src_lane
)
{
_bcast_buf
[
warp_id
]
=
x
;
}
__syncthreads
();
return
_bcast_buf
[
warp_id
];
#else
return
__shfl_sync
(
0xffffffff
,
x
,
src_lane
);
#endif
}
}
__device__
__forceinline__
float
warpReduceMax
(
float
x
)
{
__device__
__forceinline__
float
warpReduceMax
(
float
x
)
{
#if defined(ENABLE_ILUVATAR_API)
__shared__
float
_reduce_buf
[
16
*
32
];
const
int
lane
=
threadIdx
.
x
&
31
;
const
int
warp_id
=
threadIdx
.
x
/
32
;
_reduce_buf
[
threadIdx
.
x
]
=
x
;
__syncthreads
();
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
if
(
lane
<
offset
)
{
float
other
=
_reduce_buf
[
warp_id
*
32
+
lane
+
offset
];
float
cur
=
_reduce_buf
[
warp_id
*
32
+
lane
];
_reduce_buf
[
warp_id
*
32
+
lane
]
=
fmaxf
(
cur
,
other
);
}
__syncthreads
();
}
return
_reduce_buf
[
warp_id
*
32
];
#else
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
for
(
int
offset
=
16
;
offset
>
0
;
offset
>>=
1
)
{
x
=
fmaxf
(
x
,
__shfl_down_sync
(
0xffffffff
,
x
,
offset
));
x
=
fmaxf
(
x
,
__shfl_down_sync
(
0xffffffff
,
x
,
offset
));
}
}
return
x
;
return
x
;
#endif
}
}
__device__
__forceinline__
unsigned
int
cvtaToShared
(
const
void
*
ptr
)
{
__device__
__forceinline__
unsigned
int
cvtaToShared
(
const
void
*
ptr
)
{
...
...
src/infiniop/ops/paged_attention_prefill/cuda/kernel_v2.cuh
View file @
52f0dcf0
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#define __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#define __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API)
|| defined(ENABLE_ILUVATAR_API)
#include <cuda_bf16.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cuda_runtime.h>
...
@@ -194,8 +194,8 @@ __device__ void PagedAttentionPrefillWarpKernel(
...
@@ -194,8 +194,8 @@ __device__ void PagedAttentionPrefillWarpKernel(
l
=
l
*
alpha
+
beta
;
l
=
l
*
alpha
+
beta
;
m
=
m_new
;
m
=
m_new
;
}
}
alpha
=
__shfl_sync
(
0xffffffff
,
alpha
,
0
);
alpha
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
alpha
,
0
);
beta
=
__shfl_sync
(
0xffffffff
,
beta
,
0
);
beta
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
beta
,
0
);
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__)
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
...
@@ -233,7 +233,7 @@ __device__ void PagedAttentionPrefillWarpKernel(
...
@@ -233,7 +233,7 @@ __device__ void PagedAttentionPrefillWarpKernel(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
...
@@ -411,8 +411,8 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
...
@@ -411,8 +411,8 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
l
=
l
*
alpha
+
beta
;
l
=
l
*
alpha
+
beta
;
m
=
m_new
;
m
=
m_new
;
}
}
alpha
=
__shfl_sync
(
0xffffffff
,
alpha
,
0
);
alpha
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
alpha
,
0
);
beta
=
__shfl_sync
(
0xffffffff
,
beta
,
0
);
beta
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
beta
,
0
);
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__)
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
...
@@ -450,7 +450,11 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
...
@@ -450,7 +450,11 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
#ifdef ENABLE_ILUVATAR_API
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
#else
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
#endif
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
...
@@ -785,8 +789,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
...
@@ -785,8 +789,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
l
=
l
*
alpha
+
beta
;
l
=
l
*
alpha
+
beta
;
m
=
m_new
;
m
=
m_new
;
}
}
alpha
=
__shfl_sync
(
0xffffffff
,
alpha
,
0
);
alpha
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
alpha
,
0
);
beta
=
__shfl_sync
(
0xffffffff
,
beta
,
0
);
beta
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
beta
,
0
);
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__)
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
...
@@ -826,7 +830,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
...
@@ -826,7 +830,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
...
@@ -1270,7 +1274,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelPipelined(
...
@@ -1270,7 +1274,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelPipelined(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
...
@@ -1961,8 +1965,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
...
@@ -1961,8 +1965,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
l
=
l
*
alpha
+
beta
;
l
=
l
*
alpha
+
beta
;
m
=
m_new
;
m
=
m_new
;
}
}
alpha
=
__shfl_sync
(
0xffffffff
,
alpha
,
0
);
alpha
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
alpha
,
0
);
beta
=
__shfl_sync
(
0xffffffff
,
beta
,
0
);
beta
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
beta
,
0
);
#if defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__)
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
Tdata
,
half
>
)
{
...
@@ -2002,7 +2006,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
...
@@ -2002,7 +2006,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
for
(
int
i
=
0
;
i
<
DIMS_PER_THREAD
;
++
i
)
{
...
@@ -2131,7 +2135,7 @@ __device__ __forceinline__ void PagedAttentionPrefillMmaScoreWriteRow(
...
@@ -2131,7 +2135,7 @@ __device__ __forceinline__ void PagedAttentionPrefillMmaScoreWriteRow(
if
(
lane
==
0
)
{
if
(
lane
==
0
)
{
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
inv_l
=
1.0
f
/
(
l
+
1e-6
f
);
}
}
inv_l
=
__shfl_sync
(
0xffffffff
,
inv_l
,
0
);
inv_l
=
op
::
paged_attention
::
cuda
::
warpBroadcast
(
inv_l
,
0
);
const
int64_t
q_token
=
q_start
+
static_cast
<
int64_t
>
(
q_token_local
);
const
int64_t
q_token
=
q_start
+
static_cast
<
int64_t
>
(
q_token_local
);
half
*
out_ptr
=
out_
+
q_token
*
o_stride
+
static_cast
<
int64_t
>
(
head_idx
)
*
o_head_stride
;
half
*
out_ptr
=
out_
+
q_token
*
o_stride
+
static_cast
<
int64_t
>
(
head_idx
)
*
o_head_stride
;
...
...
src/infiniop/ops/paged_attention_prefill/nvidia/paged_attention_prefill_nvidia.cu
View file @
52f0dcf0
...
@@ -21,6 +21,11 @@ constexpr size_t ceilDiv(size_t a, size_t b) {
...
@@ -21,6 +21,11 @@ constexpr size_t ceilDiv(size_t a, size_t b) {
}
}
inline
const
char
*
default_prefill_kernel
(
const
PagedAttentionPrefillInfo
&
info
)
{
inline
const
char
*
default_prefill_kernel
(
const
PagedAttentionPrefillInfo
&
info
)
{
// Iluvatar: use warp (stable). Users can override via INFINIOP_FLASH_PREFILL_KERNEL.
#ifdef ENABLE_ILUVATAR_API
(
void
)
info
;
return
"warp"
;
#endif
// Heuristic auto-dispatch (v0.4):
// Heuristic auto-dispatch (v0.4):
// - Prefer the pipelined + tile-wise softmax kernel on FA2-compatible block_size=256.
// - Prefer the pipelined + tile-wise softmax kernel on FA2-compatible block_size=256.
// - Keep a conservative fallback for other shapes / older GPUs (cp.async is a no-op below SM80).
// - Keep a conservative fallback for other shapes / older GPUs (cp.async is a no-op below SM80).
...
...
src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
View file @
52f0dcf0
...
@@ -8,8 +8,8 @@
...
@@ -8,8 +8,8 @@
#define ARRAY_TYPE_SIZE size_t
#define ARRAY_TYPE_SIZE size_t
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
#define MAX_BLOCK_ARRAY_SIZE
5
#define MAX_BLOCK_ARRAY_SIZE
6
#define MAX_GRID_ARRAY_SIZE
5
#define MAX_GRID_ARRAY_SIZE
6
template
<
int
ArrSize
,
typename
ArrayType
>
template
<
int
ArrSize
,
typename
ArrayType
>
struct
ArrayStruct
{
struct
ArrayStruct
{
...
@@ -185,32 +185,43 @@ struct Constraint {
...
@@ -185,32 +185,43 @@ struct Constraint {
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
// 为1-
5
和1-
5
的所有组合生成内核
// 为1-
6
和1-
6
的所有组合生成内核
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
1
,
6
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
2
,
6
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
3
,
6
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
4
,
6
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
5
,
6
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
1
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
2
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
3
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
4
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
5
)
DEFINE_KERNELS_BY_CONSTRAINT
(
6
,
6
)
// 准备参数结构体
// 准备参数结构体
struct
RearrangeParams
{
struct
RearrangeParams
{
...
@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
...
@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
case 5: \
case 5: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
break; \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6); \
break; \
}
}
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
...
@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
...
@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
case 5: \
case 5: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
break; \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(6); \
break; \
}
}
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
...
...
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
View file @
52f0dcf0
...
@@ -64,6 +64,7 @@ infiniStatus_t Descriptor::create(
...
@@ -64,6 +64,7 @@ infiniStatus_t Descriptor::create(
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
#ifdef ENABLE_QY_API
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
>
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
>
infiniStatus_t
Descriptor
::
launchKernel
(
const
I8GemmInfo
&
info
,
Tdata
*
y
,
const
Tdata
*
bias
,
const
int8_t
*
x_packed
,
const
float
*
x_scale
,
const
int8_t
*
w_packed
,
const
float
*
w_scale
,
void
*
stream_
,
void
*
workspace
)
const
{
infiniStatus_t
Descriptor
::
launchKernel
(
const
I8GemmInfo
&
info
,
Tdata
*
y
,
const
Tdata
*
bias
,
const
int8_t
*
x_packed
,
const
float
*
x_scale
,
const
int8_t
*
w_packed
,
const
float
*
w_scale
,
void
*
stream_
,
void
*
workspace
)
const
{
cudaStream_t
stream
=
(
cudaStream_t
)
stream_
;
cudaStream_t
stream
=
(
cudaStream_t
)
stream_
;
...
@@ -112,6 +113,7 @@ infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const
...
@@ -112,6 +113,7 @@ infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
#endif
infiniStatus_t
Descriptor
::
calculate
(
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
void
*
workspace
,
...
...
src/infiniop/ops/sigmoid/operator.cc
View file @
52f0dcf0
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#include "cpu/sigmoid_cpu.h"
#endif
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
|| defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#include "nvidia/sigmoid_nvidia.cuh"
#endif
#endif
...
@@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
...
@@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -65,7 +68,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
...
@@ -65,7 +68,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
@@ -101,7 +106,9 @@ __C infiniStatus_t infiniopSigmoid(
...
@@ -101,7 +106,9 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
@@ -131,7 +138,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
...
@@ -131,7 +138,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
}
...
...
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
View file @
52f0dcf0
...
@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
...
@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
y
,
x
,
_info
.
dtype
,
_info
.
othersize
,
_info
.
dimsize
,
_info
.
stride
,
stream
));
y
,
x
,
_info
.
dtype
,
_info
.
othersize
,
_info
.
dimsize
,
_info
.
stride
,
stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_2048
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_2048
>
(
y
,
x
,
_info
.
dtype
,
_info
.
othersize
,
_info
.
dimsize
,
_info
.
stride
,
stream
));
}
else
{
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
}
...
...
src/infiniop/ops/topksoftmax/operator.cc
View file @
52f0dcf0
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#include "cpu/topksoftmax_cpu.h"
#endif
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
|| defined(ENABLE_ILUVATAR_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
...
@@ -36,6 +36,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
...
@@ -36,6 +36,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#endif
#endif
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
}
}
...
@@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
...
@@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#endif
#endif
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
}
}
...
@@ -101,6 +107,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
...
@@ -101,6 +107,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#endif
#endif
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
}
}
...
@@ -131,6 +140,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
...
@@ -131,6 +140,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#endif
#endif
#ifdef ENABLE_ALI_API
#ifdef ENABLE_ALI_API
DESTROY
(
INFINI_DEVICE_ALI
,
nvidia
);
DESTROY
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
}
}
...
...
test/infinicore/ops/abs.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
BaseOperatorTest
,
TensorSpec
,
TestCase
,
GenericTestRunner
from
framework
import
BaseOperatorTest
,
TensorSpec
,
TestCase
,
GenericTestRunner
# Test cases format: (in_shape, in_strides_or_None)
# Test cases format: (in_shape, in_strides_or_None)
...
...
test/infinicore/ops/acos.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
(
from
framework
import
(
BaseOperatorTest
,
BaseOperatorTest
,
TensorSpec
,
TensorSpec
,
...
...
test/infinicore/ops/acosh.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
(
from
framework
import
(
BaseOperatorTest
,
BaseOperatorTest
,
TensorSpec
,
TensorSpec
,
...
...
test/infinicore/ops/adaptive_avg_pool1d.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
(
from
framework
import
(
BaseOperatorTest
,
BaseOperatorTest
,
TensorSpec
,
TensorSpec
,
...
...
test/infinicore/ops/adaptive_avg_pool2d.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
(
from
framework
import
(
BaseOperatorTest
,
BaseOperatorTest
,
TensorSpec
,
TensorSpec
,
...
...
test/infinicore/ops/adaptive_avg_pool3d.py
View file @
52f0dcf0
...
@@ -3,8 +3,8 @@ import os
...
@@ -3,8 +3,8 @@ import os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
import
infinicore
import
torch
from
framework
import
(
from
framework
import
(
BaseOperatorTest
,
BaseOperatorTest
,
TensorSpec
,
TensorSpec
,
...
...
Prev
1
2
3
4
5
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment