Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
0166515c
Unverified
Commit
0166515c
authored
Aug 07, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 07, 2025
Browse files
Merge branch 'main' into issue/300
parents
f0300ff3
a23c4d13
Changes
175
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
337 additions
and
129 deletions
+337
-129
src/infiniop-test/include/tensor.hpp
src/infiniop-test/include/tensor.hpp
+4
-2
src/infiniop-test/include/utils.hpp
src/infiniop-test/include/utils.hpp
+9
-1
src/infiniop-test/src/tensor.cpp
src/infiniop-test/src/tensor.cpp
+44
-0
src/infiniop/devices/handle.cc
src/infiniop/devices/handle.cc
+15
-9
src/infiniop/devices/metax/metax_common.h
src/infiniop/devices/metax/metax_common.h
+3
-3
src/infiniop/devices/metax/metax_handle.cc
src/infiniop/devices/metax/metax_handle.cc
+3
-3
src/infiniop/devices/metax/metax_handle.h
src/infiniop/devices/metax/metax_handle.h
+5
-5
src/infiniop/devices/metax/metax_kernel_common.h
src/infiniop/devices/metax/metax_kernel_common.h
+17
-11
src/infiniop/devices/nvidia/nvidia_common.cu
src/infiniop/devices/nvidia/nvidia_common.cu
+20
-7
src/infiniop/devices/nvidia/nvidia_common.cuh
src/infiniop/devices/nvidia/nvidia_common.cuh
+5
-3
src/infiniop/devices/nvidia/nvidia_handle.cuh
src/infiniop/devices/nvidia/nvidia_handle.cuh
+11
-4
src/infiniop/devices/nvidia/nvidia_handle.h
src/infiniop/devices/nvidia/nvidia_handle.h
+12
-5
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
+8
-6
src/infiniop/elementwise/metax/elementwise_metax.h
src/infiniop/elementwise/metax/elementwise_metax.h
+16
-16
src/infiniop/elementwise/metax/elementwise_metax_api.h
src/infiniop/elementwise/metax/elementwise_metax_api.h
+21
-21
src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
src/infiniop/elementwise/nvidia/elementwise_nvidia.cuh
+10
-10
src/infiniop/elementwise/nvidia/elementwise_nvidia_api.cuh
src/infiniop/elementwise/nvidia/elementwise_nvidia_api.cuh
+18
-18
src/infiniop/ninetoothed/build.py
src/infiniop/ninetoothed/build.py
+112
-0
src/infiniop/ops/add/cpu/add_cpu.cc
src/infiniop/ops/add/cpu/add_cpu.cc
+3
-1
src/infiniop/ops/add/cuda/kernel.cuh
src/infiniop/ops/add/cuda/kernel.cuh
+1
-4
No files found.
src/infiniop-test/include/tensor.hpp
View file @
0166515c
...
...
@@ -6,6 +6,8 @@
inline
infiniDtype_t
ggmlTypeToInfiniType
(
GGML_TYPE
type
)
{
switch
(
type
)
{
case
GGML_TYPE_Q8_K
:
return
INFINI_DTYPE_BOOL
;
case
GGML_TYPE_I8
:
return
INFINI_DTYPE_I8
;
case
GGML_TYPE_I16
:
...
...
@@ -14,10 +16,10 @@ inline infiniDtype_t ggmlTypeToInfiniType(GGML_TYPE type) {
return
INFINI_DTYPE_I32
;
case
GGML_TYPE_I64
:
return
INFINI_DTYPE_I64
;
case
GGML_TYPE_F16
:
return
INFINI_DTYPE_F16
;
case
GGML_TYPE_BF16
:
return
INFINI_DTYPE_BF16
;
case
GGML_TYPE_F16
:
return
INFINI_DTYPE_F16
;
case
GGML_TYPE_F32
:
return
INFINI_DTYPE_F32
;
case
GGML_TYPE_F64
:
...
...
src/infiniop-test/include/utils.hpp
View file @
0166515c
...
...
@@ -9,12 +9,16 @@
inline
double
getVal
(
void
*
ptr
,
GGML_TYPE
ggml_type
)
{
switch
(
ggml_type
)
{
case
GGML_TYPE_BF16
:
return
utils
::
cast
<
float
>
(
*
(
bf16_t
*
)
ptr
);
case
GGML_TYPE_F16
:
return
utils
::
cast
<
double
>
(
*
(
fp16_t
*
)
ptr
);
return
utils
::
cast
<
float
>
(
*
(
fp16_t
*
)
ptr
);
case
GGML_TYPE_F32
:
return
*
(
float
*
)
ptr
;
case
GGML_TYPE_F64
:
return
*
(
double
*
)
ptr
;
case
GGML_TYPE_Q8_K
:
return
*
(
bool
*
)
ptr
;
case
GGML_TYPE_I8
:
return
*
(
int8_t
*
)
ptr
;
case
GGML_TYPE_I16
:
...
...
@@ -30,12 +34,16 @@ inline double getVal(void *ptr, GGML_TYPE ggml_type) {
inline
size_t
ggmlSizeOf
(
GGML_TYPE
ggml_type
)
{
switch
(
ggml_type
)
{
case
GGML_TYPE_BF16
:
return
sizeof
(
bf16_t
);
case
GGML_TYPE_F16
:
return
sizeof
(
fp16_t
);
case
GGML_TYPE_F32
:
return
sizeof
(
float
);
case
GGML_TYPE_F64
:
return
sizeof
(
double
);
case
GGML_TYPE_Q8_K
:
return
sizeof
(
bool
);
case
GGML_TYPE_I8
:
return
sizeof
(
int8_t
);
case
GGML_TYPE_I16
:
...
...
src/infiniop-test/src/tensor.cpp
View file @
0166515c
#include "tensor.hpp"
#include "gguf.hpp"
#include "utils.hpp"
#include <cstring>
#include <infinirt.h>
...
...
@@ -19,6 +20,40 @@ void printData(const T *data, const std::vector<size_t> &shape, const std::vecto
}
}
// The type int8_t is represented by signed char, with a range of -128 to 127.
// Streaming it directly would emit raw (possibly non-printable) characters, so
// every element is widened to int before it is printed.
//
// Recursively prints the sub-tensor that starts at `data`:
//   - at the innermost dimension, the elements are printed on one line,
//     separated by spaces, followed by a line break;
//   - at an outer dimension, each slice is printed recursively and followed
//     by an extra line break.
//
// data:    pointer to the first element of the current sub-tensor
// shape:   extent of every dimension
// strides: element stride of every dimension (may be negative)
// dim:     dimension currently being traversed; values past the last
//          dimension print nothing
template <>
void printData(const int8_t *data, const std::vector<size_t> &shape, const std::vector<ptrdiff_t> &strides, size_t dim) {
    // A rank-0 tensor holds a single element. Without this guard
    // `shape.size() - 1` wraps around to SIZE_MAX and `shape[dim]` is read
    // out of bounds.
    if (shape.empty()) {
        std::cout << static_cast<int>(*data) << std::endl;
        return;
    }

    const size_t last_dim = shape.size() - 1;
    if (dim > last_dim) {
        return;
    }

    if (dim == last_dim) {
        for (size_t i = 0; i < shape[dim]; i++) {
            // Compute the offset in signed arithmetic so that negative strides
            // step backwards instead of relying on unsigned wrap-around.
            const ptrdiff_t offset = static_cast<ptrdiff_t>(i) * strides[dim];
            std::cout << static_cast<int>(*(data + offset)) << " ";
        }
        std::cout << std::endl;
        return;
    }

    for (size_t i = 0; i < shape[dim]; i++) {
        const ptrdiff_t offset = static_cast<ptrdiff_t>(i) * strides[dim];
        printData(data + offset, shape, strides, dim + 1);
        std::cout << std::endl;
    }
}
// bf16_t specialization: each element is converted to float with
// utils::cast<float> before it is streamed.
//
// Recursively prints the sub-tensor that starts at `data`:
//   - at the innermost dimension, the elements are printed on one line,
//     separated by spaces, followed by a line break;
//   - at an outer dimension, each slice is printed recursively and followed
//     by an extra line break.
//
// data:    pointer to the first element of the current sub-tensor
// shape:   extent of every dimension
// strides: element stride of every dimension (may be negative)
// dim:     dimension currently being traversed; values past the last
//          dimension print nothing
template <>
void printData(const bf16_t *data, const std::vector<size_t> &shape, const std::vector<ptrdiff_t> &strides, size_t dim) {
    // A rank-0 tensor holds a single element. Without this guard
    // `shape.size() - 1` wraps around to SIZE_MAX and `shape[dim]` is read
    // out of bounds.
    if (shape.empty()) {
        std::cout << utils::cast<float>(*data) << std::endl;
        return;
    }

    const size_t last_dim = shape.size() - 1;
    if (dim > last_dim) {
        return;
    }

    if (dim == last_dim) {
        for (size_t i = 0; i < shape[dim]; i++) {
            // Compute the offset in signed arithmetic so that negative strides
            // step backwards instead of relying on unsigned wrap-around.
            const ptrdiff_t offset = static_cast<ptrdiff_t>(i) * strides[dim];
            std::cout << utils::cast<float>(*(data + offset)) << " ";
        }
        std::cout << std::endl;
        return;
    }

    for (size_t i = 0; i < shape[dim]; i++) {
        const ptrdiff_t offset = static_cast<ptrdiff_t>(i) * strides[dim];
        printData(data + offset, shape, strides, dim + 1);
        std::cout << std::endl;
    }
}
template
<
>
void
printData
(
const
fp16_t
*
data
,
const
std
::
vector
<
size_t
>
&
shape
,
const
std
::
vector
<
ptrdiff_t
>
&
strides
,
size_t
dim
)
{
...
...
@@ -26,6 +61,7 @@ void printData(const fp16_t *data, const std::vector<size_t> &shape,
for
(
size_t
i
=
0
;
i
<
shape
[
dim
];
i
++
)
{
std
::
cout
<<
utils
::
cast
<
float
>
(
*
(
data
+
i
*
strides
[
dim
]))
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
else
if
(
dim
<
shape
.
size
()
-
1
)
{
for
(
size_t
i
=
0
;
i
<
shape
[
dim
];
i
++
)
{
printData
(
data
+
i
*
strides
[
dim
],
shape
,
strides
,
dim
+
1
);
...
...
@@ -227,6 +263,8 @@ void Tensor::debug() const {
auto
tensor
=
to
(
INFINI_DEVICE_CPU
,
0
);
std
::
cout
<<
"Tensor: "
<<
tensor
->
info
()
<<
std
::
endl
;
switch
(
_ggml_type
)
{
case
GGML_TYPE_BF16
:
printData
((
bf16_t
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
case
GGML_TYPE_F16
:
printData
((
fp16_t
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
...
...
@@ -236,6 +274,9 @@ void Tensor::debug() const {
case
GGML_TYPE_F64
:
printData
((
double
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
case
GGML_TYPE_Q8_K
:
printData
((
bool
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
case
GGML_TYPE_I8
:
printData
((
int8_t
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
...
...
@@ -245,6 +286,9 @@ void Tensor::debug() const {
case
GGML_TYPE_I32
:
printData
((
int32_t
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
case
GGML_TYPE_I64
:
printData
((
int64_t
*
)(
tensor
->
data
()),
_shape
,
_strides
,
0
);
break
;
default:
std
::
cout
<<
"Unsupported GGML type"
<<
std
::
endl
;
break
;
...
...
src/infiniop/devices/handle.cc
View file @
0166515c
...
...
@@ -5,8 +5,8 @@
#ifdef ENABLE_CPU_API
#include "cpu/cpu_handle.h"
#endif
#ifdef
ENABLE_CUDA
_API
#include "
cuda/cud
a_handle.h"
#if
def
ined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR
_API
)
#include "
nvidia/nvidi
a_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/bang_handle.h"
...
...
@@ -21,7 +21,7 @@
#include "kunlun/kunlun_handle.h"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca/maca
_handle.h"
#include "m
etax/metax
_handle.h"
#endif
__C
infiniStatus_t
infiniopCreateHandle
(
infiniopHandle_t
*
handle_ptr
)
{
...
...
@@ -41,8 +41,11 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
::
nvidia
);
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
iluvatar
);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
...
...
@@ -57,7 +60,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
default:
...
...
@@ -78,8 +81,11 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
::
nvidia
);
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
iluvatar
);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
::
cambricon
);
...
...
@@ -94,7 +100,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/devices/m
aca/
common
_maca
.h
→
src/infiniop/devices/m
etax/metax_
common.h
View file @
0166515c
#include "../../../utils.h"
#include "../pool.h"
#include "m
aca
_handle.h"
#include "m
etax
_handle.h"
#include <hcblas/hcblas.h>
#include <hcdnn/hcdnn.h>
#include <memory>
...
...
@@ -8,7 +8,7 @@
#define CHECK_MCBLAS(API) CHECK_INTERNAL(API, HCBLAS_STATUS_SUCCESS)
#define CHECK_MCDNN(API) CHECK_INTERNAL(API, HCDNN_STATUS_SUCCESS)
namespace
device
::
m
aca
{
namespace
device
::
m
etax
{
class
Handle
::
Internal
{
Pool
<
hcblasHandle_t
>
mcblas_handles
;
...
...
@@ -39,4 +39,4 @@ public:
hcdnnDataType_t
getHcdnnDtype
(
infiniDtype_t
dt
);
}
// namespace device::m
aca
}
// namespace device::m
etax
src/infiniop/devices/m
aca/maca
_handle.cc
→
src/infiniop/devices/m
etax/metax
_handle.cc
View file @
0166515c
#include "common
_maca
.h"
#include "
metax_
common.h"
namespace
device
::
m
aca
{
namespace
device
::
m
etax
{
Handle
::
Handle
(
infiniDevice_t
device
,
int
device_id
)
:
InfiniopHandle
{
device
,
device_id
},
_internal
(
std
::
make_shared
<
Handle
::
Internal
>
(
device_id
))
{}
...
...
@@ -83,4 +83,4 @@ infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace device::m
aca
}
// namespace device::m
etax
src/infiniop/devices/m
aca/maca
_handle.h
→
src/infiniop/devices/m
etax/metax
_handle.h
View file @
0166515c
#ifndef __INFINIOP_M
ACA
_HANDLE_H__
#define __INFINIOP_M
ACA
_HANDLE_H__
#ifndef __INFINIOP_M
ETAX
_HANDLE_H__
#define __INFINIOP_M
ETAX
_HANDLE_H__
#include "../../handle.h"
#include <memory>
namespace
device
::
m
aca
{
namespace
device
::
m
etax
{
struct
Handle
:
public
InfiniopHandle
{
Handle
(
int
device_id
);
class
Internal
;
...
...
@@ -20,6 +20,6 @@ private:
std
::
shared_ptr
<
Internal
>
_internal
;
};
}
// namespace device::m
aca
}
// namespace device::m
etax
#endif // __INFINIOP_M
ACA
_HANDLE_H__
#endif // __INFINIOP_M
ETAX
_HANDLE_H__
src/infiniop/devices/m
aca/maca
_kernel_common.h
→
src/infiniop/devices/m
etax/metax
_kernel_common.h
View file @
0166515c
#define INFINIOP_MACA_KERNEL __global__ void
// Possible maximum number of threads per block for MACA architectures
#define INFINIOP_METAX_KERNEL __global__ void
// Possible maximum number of threads per block for METAX architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
#define MACA_BLOCK_SIZE_512 512
#define METAX_BLOCK_SIZE_1024 1024
#define METAX_BLOCK_SIZE_512 512
#define CHECK_METAX(API) CHECK_INTERNAL(API, hcSuccess)
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using
cuda_bfloat16
=
hpcc_bfloat16
;
using
cuda_bfloat162
=
hpcc_bfloat162
;
namespace
device
::
m
aca
{
namespace
device
::
m
etax
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
...
...
@@ -37,10 +41,8 @@ indexToOffset(
}
return
res
;
}
}
// namespace device::m
aca
}
// namespace device::m
etax
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
...
...
@@ -48,7 +50,7 @@ exp_(const float val) {
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
exp
l
(
val
);
return
exp
(
val
);
}
__forceinline__
__device__
double
...
...
@@ -60,4 +62,8 @@ __forceinline__ __device__ __half
exp_
(
const
__half
x
)
{
return
hexp
(
x
);
}
#endif
// exp_ overload for __hpcc_bfloat16: forwards the argument to hexp.
__forceinline__ __device__ __hpcc_bfloat16 exp_(const __hpcc_bfloat16 x) {
    return hexp(x);
}
src/infiniop/devices/
cuda/cud
a_common.cu
→
src/infiniop/devices/
nvidia/nvidi
a_common.cu
View file @
0166515c
#include "
cud
a_handle.cuh"
#include "
nvidi
a_handle.cuh"
namespace
device
::
cuda
{
namespace
device
{
namespace
nvidia
{
Handle
::
Handle
(
infiniDevice_t
device
,
int
device_id
)
:
InfiniopHandle
{
device
,
device_id
},
...
...
@@ -34,6 +36,7 @@ infiniStatus_t Handle::Internal::useCublas(cudaStream_t stream, const Fn<cublasH
return
INFINI_STATUS_SUCCESS
;
}
#ifdef ENABLE_CUDNN_API
infiniStatus_t
Handle
::
Internal
::
useCudnn
(
cudaStream_t
stream
,
const
Fn
<
cudnnHandle_t
>
&
f
)
const
{
auto
handle
=
dnn_handles
.
pop
();
if
(
!
handle
)
{
...
...
@@ -44,6 +47,7 @@ infiniStatus_t Handle::Internal::useCudnn(cudaStream_t stream, const Fn<cudnnHan
dnn_handles
.
push
(
std
::
move
(
*
handle
));
return
INFINI_STATUS_SUCCESS
;
}
#endif
// Accessor for the stored _warp_size value.
int Handle::Internal::warpSize() const {
    return _warp_size;
}

// Accessor for the stored _max_threads_per_block value.
int Handle::Internal::maxThreadsPerBlock() const {
    return _max_threads_per_block;
}
...
...
@@ -54,6 +58,7 @@ int Handle::Internal::gridSizeX() const { return _grid_size[0]; }
// Accessor for the second entry (index 1) of _grid_size.
int Handle::Internal::gridSizeY() const {
    return _grid_size[1];
}

// Accessor for the third entry (index 2) of _grid_size.
int Handle::Internal::gridSizeZ() const {
    return _grid_size[2];
}
#ifdef ENABLE_CUDNN_API
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
)
{
switch
(
dt
)
{
case
INFINI_DTYPE_F16
:
...
...
@@ -68,7 +73,7 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return
CUDNN_DATA_INT8
;
case
INFINI_DTYPE_I32
:
return
CUDNN_DATA_INT32
;
#ifndef ENABLE_ILUVATAR_
CUDA_
API
#ifndef ENABLE_ILUVATAR_API
case
INFINI_DTYPE_I64
:
return
CUDNN_DATA_INT64
;
#endif
...
...
@@ -78,17 +83,25 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return
CUDNN_DATA_FLOAT
;
}
}
#endif
namespace
nvidia
{
// Allocates a Handle tagged INFINI_DEVICE_NVIDIA for `device_id` and stores it
// in `*handle_ptr`. Always returns INFINI_STATUS_SUCCESS.
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
    Handle *created = new Handle(INFINI_DEVICE_NVIDIA, device_id);
    *handle_ptr = created;
    return INFINI_STATUS_SUCCESS;
}
}
// namespace nvidia
namespace
iluvatar
{
Handle
::
Handle
(
int
device_id
)
:
cud
a
::
Handle
(
INFINI_DEVICE_
NVIDIA
,
device_id
)
{}
:
nvidi
a
::
Handle
(
INFINI_DEVICE_
ILUVATAR
,
device_id
)
{}
// Allocates a Handle for `device_id` through the single-argument constructor
// and stores it in `*handle_ptr`. Always returns INFINI_STATUS_SUCCESS.
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
    Handle *created = new Handle(device_id);
    *handle_ptr = created;
    return INFINI_STATUS_SUCCESS;
}
}
// namespace
nvidia
}
// namespace
iluvatar
}
// namespace device
::cuda
}
// namespace device
src/infiniop/devices/
cuda/cud
a_common.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_common.cuh
View file @
0166515c
#ifndef __INFINIOP_CUDA_COMMON_CUH__
#define __INFINIOP_CUDA_COMMON_CUH__
#include "cuda_handle.cuh"
#include "infinicore.h"
#include "nvidia_handle.cuh"
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
#ifdef ENABLE_CUDNN_API
cudnnDataType_t
getCudnnDtype
(
infiniDtype_t
dt
);
#endif
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
#endif // __INFINIOP_CUDA_COMMON_CUH__
src/infiniop/devices/
cuda/cud
a_handle.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_handle.cuh
View file @
0166515c
...
...
@@ -3,19 +3,24 @@
#include "../../../utils.h"
#include "../pool.h"
#include "
cud
a_handle.h"
#include "
nvidi
a_handle.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <functional>
#ifdef ENABLE_CUDNN_API
#include <cudnn.h>
#endif
#define CHECK_CUBLAS(API) CHECK_INTERNAL(API, CUBLAS_STATUS_SUCCESS)
#define CHECK_CUDNN(API) CHECK_INTERNAL(API, CUDNN_STATUS_SUCCESS)
namespace
device
::
cud
a
{
namespace
device
::
nvidi
a
{
class
Handle
::
Internal
{
Pool
<
cublasHandle_t
>
blas_handles
;
#ifdef ENABLE_CUDNN_API
Pool
<
cudnnHandle_t
>
dnn_handles
;
#endif
int
_warp_size
,
_max_threads_per_block
,
...
...
@@ -29,7 +34,9 @@ public:
Internal
(
int
);
infiniStatus_t
useCublas
(
cudaStream_t
stream
,
const
Fn
<
cublasHandle_t
>
&
f
)
const
;
#ifdef ENABLE_CUDNN_API
infiniStatus_t
useCudnn
(
cudaStream_t
stream
,
const
Fn
<
cudnnHandle_t
>
&
f
)
const
;
#endif
int
warpSize
()
const
;
int
maxThreadsPerBlock
()
const
;
...
...
@@ -41,6 +48,6 @@ public:
int
gridSizeZ
()
const
;
};
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
#endif // __INFINIOP_CUDA_HANDLE_CUH__
src/infiniop/devices/
cuda/cud
a_handle.h
→
src/infiniop/devices/
nvidia/nvidi
a_handle.h
View file @
0166515c
...
...
@@ -4,7 +4,9 @@
#include "../../handle.h"
#include <memory>
namespace
device
::
cuda
{
namespace
device
{
namespace
nvidia
{
struct
Handle
:
public
InfiniopHandle
{
class
Internal
;
...
...
@@ -13,21 +15,26 @@ struct Handle : public InfiniopHandle {
protected:
Handle
(
infiniDevice_t
device
,
int
device_id
);
public:
static
infiniStatus_t
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
);
private:
std
::
shared_ptr
<
Internal
>
_internal
;
};
namespace
nvidia
{
}
//
namespace nvidia
class
Handle
:
public
cuda
::
Handle
{
namespace
iluvatar
{
struct
Handle
:
public
nvidia
::
Handle
{
Handle
(
int
device_id
);
public:
static
infiniStatus_t
create
(
InfiniopHandle
**
handle_ptr
,
int
device_id
);
};
}
// namespace
nvidia
}
// namespace
iluvatar
}
// namespace device
::cuda
}
// namespace device
#endif // __INFINIOP_CUDA_HANDLE_H__
src/infiniop/devices/
cuda/cud
a_kernel_common.cuh
→
src/infiniop/devices/
nvidia/nvidi
a_kernel_common.cuh
View file @
0166515c
...
...
@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
...
...
@@ -12,8 +15,10 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace
device
::
cuda
{
using
cuda_bfloat16
=
nv_bfloat16
;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
nvidia
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__
__device__
__host__
size_t
indexToReducedOffset
(
...
...
@@ -43,16 +48,14 @@ indexToOffset(
}
return
res
;
}
}
// namespace device::
cud
a
}
// namespace device::
nvidi
a
#ifdef ENABLE_CUDA_API
#include <cuda_fp16.h>
__forceinline__
__device__
float
exp_
(
const
float
val
)
{
return
expf
(
val
);
}
#ifndef ENABLE_ILUVATAR_
CUDA_
API
#ifndef ENABLE_ILUVATAR_API
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
expl
(
val
);
...
...
@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_
(
const
__nv_bfloat16
x
)
{
return
hexp
(
x
);
}
#endif
src/infiniop/elementwise/m
aca
/elementwise_m
aca
.h
→
src/infiniop/elementwise/m
etax
/elementwise_m
etax
.h
View file @
0166515c
#ifndef __INFINIOP_ELEMENTWISE_M
ACA
_H__
#define __INFINIOP_ELEMENTWISE_M
ACA
_H__
#ifndef __INFINIOP_ELEMENTWISE_M
ETAX
_H__
#define __INFINIOP_ELEMENTWISE_M
ETAX
_H__
#include "../../../utils.h"
#include "../../devices/m
aca/
common
_maca
.h"
#include "../../devices/m
aca/maca
_kernel_common.h"
#include "elementwise_m
aca
_api.h"
#include "../../devices/m
etax/metax_
common.h"
#include "../../devices/m
etax/metax
_kernel_common.h"
#include "elementwise_m
etax
_api.h"
namespace
op
::
elementwise
::
m
aca
{
namespace
op
::
elementwise
::
m
etax
{
template
<
typename
T
>
__device__
__forceinline__
const
T
*
typedInputPtr
(
const
void
*
ptr
)
{
return
reinterpret_cast
<
const
T
*>
(
ptr
);
...
...
@@ -14,7 +14,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
__device__
__forceinline__
size_t
getOutputIndex
(
size_t
idx
,
bool
is_contiguous
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
return
is_contiguous
?
idx
:
device
::
m
aca
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
return
is_contiguous
?
idx
:
device
::
m
etax
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
}
struct
InputIndexer
{
...
...
@@ -30,8 +30,8 @@ struct InputIndexer {
return
input_contiguous
[
input_id
]
?
idx
:
(
input_broadcasted
[
input_id
]
?
device
::
m
aca
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
m
aca
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
?
device
::
m
etax
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
m
etax
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
}
};
...
...
@@ -41,7 +41,7 @@ __device__ __forceinline__ void unpackInputsAndApply(F &&f, std::index_sequence<
}
template
<
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
INFINIOP_M
ACA
_KERNEL
elementwiseKernel
(
INFINIOP_M
ETAX
_KERNEL
elementwiseKernel
(
size_t
output_size
,
size_t
ndim
,
bool
output_contiguous
,
...
...
@@ -72,7 +72,7 @@ INFINIOP_MACA_KERNEL elementwiseKernel(
}
template
<
typename
Op
,
typename
Tout
,
typename
...
Tin
>
INFINIOP_M
ACA
_KERNEL
elementwiseKernel
(
INFINIOP_M
ETAX
_KERNEL
elementwiseKernel
(
size_t
output_size
,
size_t
ndim
,
bool
output_contiguous
,
...
...
@@ -102,9 +102,9 @@ INFINIOP_MACA_KERNEL elementwiseKernel(
}
struct
DeviceImpl
::
Opaque
{
std
::
shared_ptr
<
device
::
m
aca
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
m
etax
::
Handle
::
Internal
>
internal
;
Opaque
(
const
std
::
shared_ptr
<
device
::
m
aca
::
Handle
::
Internal
>
&
internal
)
Opaque
(
const
std
::
shared_ptr
<
device
::
m
etax
::
Handle
::
Internal
>
&
internal
)
:
internal
(
internal
)
{}
template
<
uint32_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
...
...
@@ -159,8 +159,8 @@ private:
const
int8_t
*
d_meta_start
=
reinterpret_cast
<
int8_t
*>
(
workspace
)
+
input_arr_size
;
// copy the input pointer array and meta to device
CHECK_M
ACA
(
hcMemcpyAsync
(
workspace
,
h_inputs_arr
,
input_arr_size
,
hcMemcpyHostToDevice
,
stream
));
CHECK_M
ACA
(
hcMemcpyAsync
((
void
*
)
d_meta_start
,
info_meta_start
,
info
.
getMetaMemSize
(),
hcMemcpyHostToDevice
,
stream
));
CHECK_M
ETAX
(
hcMemcpyAsync
(
workspace
,
h_inputs_arr
,
input_arr_size
,
hcMemcpyHostToDevice
,
stream
));
CHECK_M
ETAX
(
hcMemcpyAsync
((
void
*
)
d_meta_start
,
info_meta_start
,
info
.
getMetaMemSize
(),
hcMemcpyHostToDevice
,
stream
));
// offset/assign the pointers
d_inputs_arr
=
reinterpret_cast
<
const
void
**>
(
workspace
);
...
...
@@ -259,6 +259,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std
::
forward
<
Args
>
(
args
)...);
}
}
// namespace op::elementwise::m
aca
}
// namespace op::elementwise::m
etax
#endif
src/infiniop/elementwise/m
aca
/elementwise_m
aca
_api.h
→
src/infiniop/elementwise/m
etax
/elementwise_m
etax
_api.h
View file @
0166515c
#ifndef __INFINIOP_ELEMENTWISE_M
ACA
_API_H__
#define __INFINIOP_ELEMENTWISE_M
ACA
_API_H__
#ifndef __INFINIOP_ELEMENTWISE_M
ETAX
_API_H__
#define __INFINIOP_ELEMENTWISE_M
ETAX
_API_H__
#include "../elementwise.h"
namespace
op
::
elementwise
::
m
aca
{
namespace
op
::
elementwise
::
m
etax
{
class
DeviceImpl
final
{
struct
Opaque
;
...
...
@@ -37,15 +37,15 @@ public:
void
*
stream
,
Args
&&
...
args
);
};
}
// namespace op::elementwise::m
aca
#define CREATE_ELEMENTWISE_M
ACA
_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
}
// namespace op::elementwise::m
etax
#define CREATE_ELEMENTWISE_M
ETAX
_DESCRIPTOR(HANDLE, DTYPE, OUT_DESC, INPUT_DESC_VEC) \
\
auto info_result = op::elementwise::ElementwiseInfo::create(OUT_DESC, INPUT_DESC_VEC); \
CHECK_RESULT(info_result); \
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::m
aca
::DeviceImpl::create(HANDLE->internal()); \
auto device_impl_result = op::elementwise::m
etax
::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
...
...
@@ -56,4 +56,4 @@ public:
HANDLE->device, \
HANDLE->device_id);
#endif // __INFINIOP_ELEMENTWISE_M
ACA
_API_H__
#endif // __INFINIOP_ELEMENTWISE_M
ETAX
_API_H__
src/infiniop/elementwise/
cud
a/elementwise_
cud
a.cuh
→
src/infiniop/elementwise/
nvidi
a/elementwise_
nvidi
a.cuh
View file @
0166515c
...
...
@@ -2,11 +2,11 @@
#define __INFINIOP_ELEMENTWISE_CUDA_H__
#include "../../../utils.h"
#include "../../devices/
cuda/cud
a_common.cuh"
#include "../../devices/
cuda/cud
a_kernel_common.cuh"
#include "elementwise_
cud
a_api.cuh"
#include "../../devices/
nvidia/nvidi
a_common.cuh"
#include "../../devices/
nvidia/nvidi
a_kernel_common.cuh"
#include "elementwise_
nvidi
a_api.cuh"
namespace
op
::
elementwise
::
cud
a
{
namespace
op
::
elementwise
::
nvidi
a
{
/**
* @brief Casts an untyped device pointer to a typed pointer of type T.
...
...
@@ -33,7 +33,7 @@ __device__ __forceinline__ const T *typedInputPtr(const void *ptr) {
*/
__device__
__forceinline__
size_t
getOutputIndex
(
size_t
idx
,
bool
is_contiguous
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
return
is_contiguous
?
idx
:
device
::
cud
a
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
return
is_contiguous
?
idx
:
device
::
nvidi
a
::
indexToOffset
(
idx
,
ndim
,
shape
,
strides
);
}
/**
...
...
@@ -61,8 +61,8 @@ struct InputIndexer {
return
input_contiguous
[
input_id
]
?
idx
:
(
input_broadcasted
[
input_id
]
?
device
::
cud
a
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
cud
a
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
?
device
::
nvidi
a
::
indexToReducedOffset
(
idx
,
ndim
,
output_strides
,
input_strides
+
input_id
*
ndim
)
:
device
::
nvidi
a
::
indexToOffset
(
idx
,
ndim
,
input_shapes
+
input_id
*
ndim
,
input_strides
+
input_id
*
ndim
));
}
};
...
...
@@ -186,9 +186,9 @@ INFINIOP_CUDA_KERNEL elementwiseKernel(
}
struct
DeviceImpl
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
Opaque
(
const
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
&
internal
)
Opaque
(
const
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
&
internal
)
:
internal
(
internal
)
{}
/**
...
...
@@ -414,6 +414,6 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
std
::
forward
<
Args
>
(
args
)...);
}
}
// namespace op::elementwise::
cud
a
}
// namespace op::elementwise::
nvidi
a
#endif // __INFINIOP_ELEMENTWISE_CUDA_H__
src/infiniop/elementwise/
cud
a/elementwise_
cud
a_api.cuh
→
src/infiniop/elementwise/
nvidi
a/elementwise_
nvidi
a_api.cuh
View file @
0166515c
...
...
@@ -3,7 +3,7 @@
#include "../elementwise.h"
namespace
op
::
elementwise
::
cud
a
{
namespace
op
::
elementwise
::
nvidi
a
{
/**
* @brief Define the methods and info needed by CUDA to perform elementwise operation
...
...
@@ -77,7 +77,7 @@ public:
void
*
stream
,
Args
&&
...
args
);
};
}
// namespace op::elementwise::
cud
a
}
// namespace op::elementwise::
nvidi
a
/**
* @brief Define the process for initializing a Descriptor of an elementwise operation
...
...
@@ -95,7 +95,7 @@ public:
auto info = info_result.take(); \
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); \
\
auto device_impl_result = op::elementwise::
cud
a::DeviceImpl::create(HANDLE->internal()); \
auto device_impl_result = op::elementwise::
nvidi
a::DeviceImpl::create(HANDLE->internal()); \
CHECK_RESULT(device_impl_result); \
\
*desc_ptr = new Descriptor( \
...
...
src/infiniop/ninetoothed/build.py
0 → 100644
View file @
0166515c
import
functools
import
inspect
import
itertools
import
pathlib
import
ninetoothed
from
ninetoothed.aot
import
_HEADER_PATH
CURRENT_FILE_PATH
=
pathlib
.
Path
(
__file__
)
BUILD_DIRECTORY_PATH
=
(
CURRENT_FILE_PATH
.
parent
.
parent
.
parent
.
parent
/
"build"
/
"ninetoothed"
)
def build(premake, constexpr_param_grid, caller, op_name, output_dir):
    """Generate one AOT kernel per constexpr combination plus a C dispatcher.

    For every combination derived from ``constexpr_param_grid``, ``premake`` is
    called with the combination as keyword arguments and the returned
    ``(arrangement, application, tensors)`` triple is passed to
    ``ninetoothed.make``, which writes the kernel into ``output_dir``.
    Afterwards ``{op_name}.c`` and ``{op_name}.h`` are written to
    ``BUILD_DIRECTORY_PATH``. The generated ``launch_{op_name}`` function
    compares its trailing ``int`` arguments against each combination and
    forwards to the matching ``launch_{kernel_name}``; if none matches it
    returns ``INFINI_STATUS_NOT_IMPLEMENTED``.

    Args:
        premake: Callable accepting one combination as keyword arguments and
            returning ``(arrangement, application, tensors)``.
        constexpr_param_grid: Mapping from constexpr parameter name to an
            iterable of candidate values. String values are rewritten to
            ``INFINI_DTYPE_*`` identifiers (``"fp"`` becomes ``"F"``, then the
            whole name is upper-cased).
        caller: Forwarded unchanged to ``ninetoothed.make``.
        op_name: Base name of the kernels, the dispatcher and the output files.
        output_dir: Directory handed to ``ninetoothed.make``; must support
            ``output_dir / "name"`` (e.g. a ``pathlib.Path``).
    """
    headers = []
    all_param_names = []
    launches = []

    for combination in _generate_param_value_combinations(constexpr_param_grid):
        arrangement, application, tensors = premake(**combination)

        # Build the dispatcher-side view of this combination in a fresh dict
        # (instead of mutating `combination` while iterating over it): keys get
        # a trailing underscore and string values become INFINI_DTYPE_* names.
        constexpr_args = {
            f"{name}_": (
                f"INFINI_DTYPE_{value.replace('fp', 'F').upper()}"
                if isinstance(value, str)
                else value
            )
            for name, value in combination.items()
        }

        kernel_name = f"{op_name}_{_generate_suffix(constexpr_args.values())}"

        ninetoothed.make(
            arrangement,
            application,
            tensors,
            caller=caller,
            kernel_name=kernel_name,
            output_dir=output_dir,
        )

        param_names = ("stream",) + tuple(
            inspect.signature(application).parameters.keys()
        )
        call_args = ", ".join(param_names)

        launches.append(
            f"    if ({_generate_condition(constexpr_args)})\n"
            f"        return launch_{kernel_name}({call_args});"
        )
        headers.append(output_dir / f"{kernel_name}.h")
        all_param_names.append(param_names)

    includes = "\n".join(f'#include "{header}"' for header in headers)

    # Order-preserving union of every kernel's parameter names, visiting the
    # longest signature first.
    merged_names = {}
    for names in sorted(all_param_names, key=len, reverse=True):
        merged_names.update(dict.fromkeys(names))
    param_names = list(merged_names)

    # The first parameter is the stream; every remaining one is a tensor.
    param_types = ["NineToothedStream"] + ["NineToothedTensor"] * (
        len(param_names) - 1
    )

    # Constexpr parameters are appended as trailing `int` arguments. They are
    # taken from the grid keys rather than from the loop variable left over
    # after the loop above, which is unbound when no combination was produced.
    for name in constexpr_param_grid:
        param_names.append(f"{name}_")
        param_types.append("int")

    param_decls = ", ".join(
        f"{c_type} {name}" for name, c_type in zip(param_names, param_types)
    )

    source_file_name = f"{op_name}.c"
    header_file_name = f"{op_name}.h"

    func_sig = f"NineToothedResult launch_{op_name}({param_decls})"

    joined_launches = "\n".join(launches)

    op_decl = f'#ifdef __cplusplus\nextern "C" {func_sig};\n#else\n{func_sig};\n#endif'
    op_def = (
        f"{func_sig} {{\n"
        f"{joined_launches}\n"
        f"    return INFINI_STATUS_NOT_IMPLEMENTED;\n"
        f"}}"
    )

    source_content = (
        f'#include "{header_file_name}"\n'
        f"\n"
        f'#include "infinicore.h"\n'
        f"\n"
        f"{includes}\n\n{op_def}\n"
    )
    header_content = f'#include "{_HEADER_PATH}"\n\n{op_decl}\n'

    # write_text does not create missing parent directories.
    BUILD_DIRECTORY_PATH.mkdir(parents=True, exist_ok=True)

    (BUILD_DIRECTORY_PATH / source_file_name).write_text(source_content)
    (BUILD_DIRECTORY_PATH / header_file_name).write_text(header_content)
def
_generate_condition
(
combination
):
return
" && "
.
join
(
f
"
{
param
}
==
{
value
}
"
for
param
,
value
in
combination
.
items
())
def
_generate_suffix
(
values
):
return
"_"
.
join
(
f
"
{
value
}
"
for
value
in
values
)
def
_generate_param_value_combinations
(
param_grid
):
keys
=
list
(
param_grid
.
keys
())
value_combinations
=
itertools
.
product
(
*
param_grid
.
values
())
return
tuple
(
dict
(
zip
(
keys
,
combination
))
for
combination
in
value_combinations
)
src/infiniop/ops/add/cpu/add_cpu.cc
View file @
0166515c
...
...
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
a_shape
=
a_desc
->
shape
();
const
auto
&
b_shape
=
b_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
c_shape
,
a_shape
,
b_shape
);
...
...
@@ -43,6 +43,8 @@ infiniStatus_t Descriptor::calculate(
return
_device_info
->
calculate
<
AddOp
,
float
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
AddOp
,
double
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
AddOp
,
bf16_t
>
(
_info
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
...
...
src/infiniop/ops/add/cuda/
add_cuda_int
ern
a
l.cuh
→
src/infiniop/ops/add/cuda/
k
ern
e
l.cuh
View file @
0166515c
#ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace
op
::
add
::
cuda
{
typedef
struct
AddOp
{
public:
...
...
@@ -12,7 +9,7 @@ public:
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
return
__hadd2
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
cuda_bfloat16
>
)
{
return
__hadd
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fadd_rd
(
a
,
b
);
...
...
Prev
1
2
3
4
5
6
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment