Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
3144cc9c
Commit
3144cc9c
authored
Feb 25, 2025
by
YdrMaster
Browse files
issue/63/style: 尽量将 cv 修饰符移动到类型前
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
b461d520
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
117 additions
and
103 deletions
+117
-103
include/infiniop/operator.h
include/infiniop/operator.h
+2
-2
include/infiniop/ops/attention.h
include/infiniop/ops/attention.h
+3
-3
include/infiniop/ops/mlp.h
include/infiniop/ops/mlp.h
+3
-3
include/infiniop/tensor_descriptor.h
include/infiniop/tensor_descriptor.h
+1
-1
src/infiniop/devices/cpu/common_cpu.cc
src/infiniop/devices/cpu/common_cpu.cc
+18
-8
src/infiniop/devices/cpu/common_cpu.h
src/infiniop/devices/cpu/common_cpu.h
+6
-6
src/infiniop/devices/cuda/common_cuda.cuh
src/infiniop/devices/cuda/common_cuda.cuh
+13
-9
src/infiniop/operator.cc
src/infiniop/operator.cc
+2
-2
src/infiniop/ops/matmul/ascend/matmul_ascend.cc
src/infiniop/ops/matmul/ascend/matmul_ascend.cc
+13
-13
src/infiniop/ops/matmul/bang/matmul_bang.cc
src/infiniop/ops/matmul/bang/matmul_bang.cc
+7
-7
src/infiniop/ops/matmul/blas.h
src/infiniop/ops/matmul/blas.h
+1
-1
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+15
-15
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+20
-20
src/infiniop/ops/matmul/matmul.h
src/infiniop/ops/matmul/matmul.h
+2
-2
src/infiniop/ops/matmul/operator.cc
src/infiniop/ops/matmul/operator.cc
+5
-5
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+1
-1
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+1
-1
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+1
-1
src/infiniop/ops/rotary_embedding/operator.cc
src/infiniop/ops/rotary_embedding/operator.cc
+2
-2
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+1
-1
No files found.
include/infiniop/operator.h
View file @
3144cc9c
...
...
@@ -10,7 +10,7 @@ typedef struct InfiniopDescriptor {
int
device_id
;
}
InfiniopDescriptor
;
__C
__export
infiniopStatus_t
infiniopGetDescriptorDeviceType
(
InfiniopDescriptor
const
*
desc_ptr
,
infiniDevice_t
*
device_type
);
__C
__export
infiniopStatus_t
infiniopGetDescriptorDeviceId
(
InfiniopDescriptor
const
*
desc_ptr
,
int
*
device_id
);
__C
__export
infiniopStatus_t
infiniopGetDescriptorDeviceType
(
const
InfiniopDescriptor
*
desc_ptr
,
infiniDevice_t
*
device_type
);
__C
__export
infiniopStatus_t
infiniopGetDescriptorDeviceId
(
const
InfiniopDescriptor
*
desc_ptr
,
int
*
device_id
);
#endif //__INFINIOP_OPERATOR___
include/infiniop/ops/attention.h
View file @
3144cc9c
...
...
@@ -23,9 +23,9 @@ __C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t de
void
*
workspace
,
size_t
workspace_size
,
void
*
out
,
void
const
*
q
,
void
const
*
k
,
void
const
*
v
,
const
void
*
q
,
const
void
*
k
,
const
void
*
v
,
void
*
k_cache
,
void
*
v_cache
,
void
*
stream
);
...
...
include/infiniop/ops/mlp.h
View file @
3144cc9c
...
...
@@ -22,9 +22,9 @@ __C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
void
const
*
x
,
void
const
*
w12
,
void
const
*
w3
,
const
void
*
x
,
const
void
*
w12
,
const
void
*
w3
,
void
*
stream
);
__C
__export
infiniopStatus_t
infiniopDestroyMLPDescriptor
(
infiniopMLPDescriptor_t
desc
);
...
...
include/infiniop/tensor_descriptor.h
View file @
3144cc9c
...
...
@@ -17,7 +17,7 @@ struct InfiniopTensorDescriptor {
typedef
struct
InfiniopTensorDescriptor
*
infiniopTensorDescriptor_t
;
__C
__export
infiniopStatus_t
infiniopCreateTensorDescriptor
(
infiniopTensorDescriptor_t
*
desc_ptr
,
size_t
ndim
,
size_t
cons
t
*
shape
,
ptrdiff_t
const
*
strides
,
infiniDtype_t
dtype
);
__C
__export
infiniopStatus_t
infiniopCreateTensorDescriptor
(
infiniopTensorDescriptor_t
*
desc_ptr
,
size_t
ndim
,
const
size_
t
*
shape
,
const
ptrdiff_t
*
strides
,
infiniDtype_t
dtype
);
__C
__export
infiniopStatus_t
infiniopDestroyTensorDescriptor
(
infiniopTensorDescriptor_t
desc
);
...
...
src/infiniop/devices/cpu/common_cpu.cc
View file @
3144cc9c
...
...
@@ -59,9 +59,11 @@ uint16_t f32_to_f16(float val) {
}
}
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
ptrdiff_t
const
*
broadcasted_strides
,
ptrdiff_t
const
*
target_strides
)
{
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
]
*
target_strides
[
i
];
...
...
@@ -70,8 +72,11 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim,
return
res
;
}
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
size_t
const
*
shape
,
ptrdiff_t
const
*
strides
)
{
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>=
0
;)
{
res
+=
(
flat_index
%
shape
[
i
])
*
strides
[
i
];
...
...
@@ -80,7 +85,10 @@ size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
return
res
;
}
size_t
getPaddedSize
(
size_t
ndim
,
size_t
*
shape
,
size_t
const
*
pads
)
{
size_t
getPaddedSize
(
size_t
ndim
,
size_t
*
shape
,
const
size_t
*
pads
)
{
size_t
total_size
=
1
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
total_size
*=
shape
[
i
]
+
(
i
<
2
?
0
:
2
*
pads
[
i
-
2
]);
...
...
@@ -88,8 +96,10 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
return
total_size
;
}
std
::
vector
<
size_t
>
getPaddedShape
(
size_t
ndim
,
size_t
const
*
shape
,
size_t
const
*
pads
)
{
std
::
vector
<
size_t
>
getPaddedShape
(
size_t
ndim
,
const
size_t
*
shape
,
const
size_t
*
pads
)
{
std
::
vector
<
size_t
>
padded_shape
(
ndim
);
memcpy
(
padded_shape
.
data
(),
shape
,
ndim
*
sizeof
(
size_t
));
for
(
size_t
i
=
2
;
i
<
ndim
;
++
i
)
{
...
...
src/infiniop/devices/cpu/common_cpu.h
View file @
3144cc9c
#ifndef __INFINIOP_
_
COMMON_CPU_H__
#define __INFINIOP_
_
COMMON_CPU_H__
#ifndef __INFINIOP_COMMON_CPU_H__
#define __INFINIOP_COMMON_CPU_H__
#include <cmath>
#include <cstddef>
...
...
@@ -14,18 +14,18 @@ float f16_to_f32(uint16_t code);
uint16_t
f32_to_f16
(
float
val
);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
ptrdiff_t
const
*
broadcasted_strides
,
ptrdiff_t
const
*
target_strides
);
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
);
// return the memory offset a tensor given flattened index
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
size_t
cons
t
*
shape
,
ptrdiff_t
const
*
strides
);
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_
t
*
shape
,
const
ptrdiff_t
*
strides
);
/**
* get the total array size (element count) after applying padding for a
* ndim-ary tensor with the given shape
*/
size_t
getPaddedSize
(
size_t
ndim
,
size_t
*
shape
,
size_t
cons
t
*
pads
);
size_t
getPaddedSize
(
size_t
ndim
,
size_t
*
shape
,
const
size_
t
*
pads
);
// calculate the padded shape and store the result in padded_shape
std
::
vector
<
size_t
>
getPaddedShape
(
size_t
ndim
,
size_t
const
*
shape
,
size_t
cons
t
*
pads
);
std
::
vector
<
size_t
>
getPaddedShape
(
size_t
ndim
,
const
size_t
*
shape
,
const
size_
t
*
pads
);
#endif // __INFINIOP__COMMON_CPU_H__
src/infiniop/devices/cuda/common_cuda.cuh
View file @
3144cc9c
...
...
@@ -47,8 +47,8 @@ struct InfiniopCudaHandle {
int
compute_capability_minor
;
};
template
<
class
T
>
void
use_cublas
(
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
&
pool
,
cudaStream_t
stream
,
T
const
&
f
)
{
template
<
typename
T
>
void
use_cublas
(
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
&
pool
,
cudaStream_t
stream
,
const
T
&
f
)
{
auto
handle
=
pool
->
pop
();
if
(
!
handle
)
{
cublasCreate
(
&
(
*
handle
));
...
...
@@ -58,8 +58,8 @@ void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream
pool
->
push
(
std
::
move
(
*
handle
));
}
template
<
class
T
>
void
use_cudnn
(
std
::
shared_ptr
<
Pool
<
cudnnHandle_t
>>
&
pool
,
cudaStream_t
stream
,
T
const
&
f
)
{
template
<
typename
T
>
void
use_cudnn
(
std
::
shared_ptr
<
Pool
<
cudnnHandle_t
>>
&
pool
,
cudaStream_t
stream
,
const
T
&
f
)
{
auto
handle
=
pool
->
pop
();
if
(
!
handle
)
{
cudnnCreate
(
&
(
*
handle
));
...
...
@@ -95,8 +95,10 @@ inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
// return the memory offset of original tensor, given the flattened index of
// broadcasted tensor
inline
__device__
__host__
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
ptrdiff_t
const
*
broadcasted_strides
,
ptrdiff_t
const
*
target_strides
)
{
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
0
;
i
<
ndim
;
++
i
)
{
res
+=
flat_index
/
broadcasted_strides
[
i
]
*
target_strides
[
i
];
...
...
@@ -106,9 +108,11 @@ inline __device__ __host__ size_t indexToReducedOffset(
}
// get the memory offset of the given element in a tensor given its flat index
inline
__device__
__host__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
size_t
const
*
shape
,
ptrdiff_t
const
*
strides
)
{
inline
__device__
__host__
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
)
{
size_t
res
=
0
;
for
(
size_t
i
=
ndim
;
i
--
>
0
;)
{
res
+=
(
flat_index
%
shape
[
i
])
*
strides
[
i
];
...
...
src/infiniop/operator.cc
View file @
3144cc9c
#include "infiniop/operator.h"
infiniopStatus_t
infiniopGetDescriptorDeviceType
(
InfiniopDescriptor
const
*
desc_ptr
,
const
InfiniopDescriptor
*
desc_ptr
,
infiniDevice_t
*
device_type
)
{
*
device_type
=
desc_ptr
->
device_type
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
infiniopGetDescriptorDeviceId
(
InfiniopDescriptor
const
*
desc_ptr
,
const
InfiniopDescriptor
*
desc_ptr
,
int
*
device_id
)
{
*
device_id
=
desc_ptr
->
device_id
;
return
INFINIOP_STATUS_SUCCESS
;
...
...
src/infiniop/ops/matmul/ascend/matmul_ascend.cc
View file @
3144cc9c
...
...
@@ -43,7 +43,7 @@ infiniopStatus_t Descriptor::create(
}
infiniopStatus_t
status
;
auto
_
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
...
...
@@ -56,21 +56,21 @@ infiniopStatus_t Descriptor::create(
// operation
CHECK_STATUS
(
c
->
setDescriptor
(
toAclDataType
(
c_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
_
info
.
c_matrix
.
rows
),
static_cast
<
int64_t
>
(
_
info
.
c_matrix
.
cols
)},
{
_
info
.
c_matrix
.
row_stride
,
_
info
.
c_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
c_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
c_matrix
.
cols
)},
{
info
.
c_matrix
.
row_stride
,
info
.
c_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
a
->
setDescriptor
(
toAclDataType
(
a_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
_
info
.
a_matrix
.
rows
),
static_cast
<
int64_t
>
(
_
info
.
a_matrix
.
cols
)},
{
_
info
.
a_matrix
.
row_stride
,
_
info
.
a_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
a_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
a_matrix
.
cols
)},
{
info
.
a_matrix
.
row_stride
,
info
.
a_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
b
->
setDescriptor
(
toAclDataType
(
b_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
_
info
.
b_matrix
.
rows
),
static_cast
<
int64_t
>
(
_
info
.
b_matrix
.
cols
)},
{
_
info
.
b_matrix
.
row_stride
,
_
info
.
b_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
b_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
b_matrix
.
cols
)},
{
info
.
b_matrix
.
row_stride
,
info
.
b_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
c
->
createTensor
(),
INFINIOP_STATUS_SUCCESS
);
...
...
@@ -95,7 +95,7 @@ infiniopStatus_t Descriptor::create(
aclSetAclOpExecutorRepeatable
(
executor
);
*
desc_ptr
=
new
Descriptor
(
dtype
,
_
info
,
workspace_size
,
dtype
,
info
,
workspace_size
,
new
Opaque
{
executor
,
c
,
...
...
@@ -112,8 +112,8 @@ infiniopStatus_t Descriptor::calculate(
size_t
workspaceSize_
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
void
*
stream
)
const
{
...
...
src/infiniop/ops/matmul/bang/matmul_bang.cc
View file @
3144cc9c
...
...
@@ -73,7 +73,7 @@ infiniopStatus_t Descriptor::create(
}
infiniopStatus_t
status
;
auto
_
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
...
...
@@ -83,9 +83,9 @@ infiniopStatus_t Descriptor::create(
cnnlCreateTensorDescriptor
(
&
b
);
cnnlCreateTensorDescriptor
(
&
c
);
setMatrixTensorEx
(
a
,
_
info
.
a_matrix
,
a_desc
->
dtype
);
setMatrixTensorEx
(
b
,
_
info
.
b_matrix
,
b_desc
->
dtype
);
setMatrixTensorEx
(
c
,
_
info
.
c_matrix
,
c_desc
->
dtype
);
setMatrixTensorEx
(
a
,
info
.
a_matrix
,
a_desc
->
dtype
);
setMatrixTensorEx
(
b
,
info
.
b_matrix
,
b_desc
->
dtype
);
setMatrixTensorEx
(
c
,
info
.
c_matrix
,
c_desc
->
dtype
);
cnnlMatMulDescriptor_t
op
;
cnnlMatMulAlgo_t
algo
;
...
...
@@ -112,7 +112,7 @@ infiniopStatus_t Descriptor::create(
cnnlGetBatchMatMulHeuristicResult
(
algoResult
,
algo
,
&
workspace_size
);
*
desc_ptr
=
new
Descriptor
(
dtype
,
_
info
,
workspace_size
,
dtype
,
info
,
workspace_size
,
new
Opaque
{
op
,
algo
,
...
...
@@ -130,8 +130,8 @@ infiniopStatus_t Descriptor::calculate(
size_t
workspace_size
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
void
*
stream
)
const
{
...
...
src/infiniop/ops/matmul/blas.h
View file @
3144cc9c
...
...
@@ -60,7 +60,7 @@ struct BlasMatrix {
}
};
enum
class
MatrixLayout
:
uint8_t
{
enum
class
MatrixLayout
:
char
{
COL_MAJOR
,
ROW_MAJOR
,
};
...
...
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
View file @
3144cc9c
...
...
@@ -20,13 +20,13 @@ infiniopStatus_t Descriptor::create(
}
infiniopStatus_t
status
;
auto
_
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
*
desc_ptr
=
new
Descriptor
(
dtype
,
_
info
,
0
,
dtype
,
info
,
0
,
nullptr
,
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
...
...
@@ -34,24 +34,24 @@ infiniopStatus_t Descriptor::create(
template
<
typename
Tdata
>
void
calculate
(
MatmulInfo
const
&
_
info
,
const
MatmulInfo
&
info
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
)
{
if
(
_
info
.
is_transed
)
{
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
for
(
size_t
i
=
0
;
i
<
_
info
.
batch
;
++
i
)
{
for
(
size_t
m_
=
0
;
m_
<
_
info
.
m
;
++
m_
)
{
for
(
size_t
n_
=
0
;
n_
<
_
info
.
n
;
++
n_
)
{
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
_
info
.
c_matrix
.
stride
+
m_
*
_
info
.
c_matrix
.
row_stride
+
n_
*
_
info
.
c_matrix
.
col_stride
;
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
for
(
size_t
m_
=
0
;
m_
<
info
.
m
;
++
m_
)
{
for
(
size_t
n_
=
0
;
n_
<
info
.
n
;
++
n_
)
{
auto
c_
=
reinterpret_cast
<
Tdata
*>
(
c
)
+
i
*
info
.
c_matrix
.
stride
+
m_
*
info
.
c_matrix
.
row_stride
+
n_
*
info
.
c_matrix
.
col_stride
;
float
sum
=
0
;
for
(
size_t
k_
=
0
;
k_
<
_
info
.
k
;
++
k_
)
{
auto
a_
=
reinterpret_cast
<
Tdata
const
*>
(
a
)
+
i
*
_
info
.
a_matrix
.
stride
+
m_
*
_
info
.
a_matrix
.
row_stride
+
k_
*
_
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
Tdata
const
*>
(
b
)
+
i
*
_
info
.
b_matrix
.
stride
+
n_
*
_
info
.
b_matrix
.
col_stride
+
k_
*
_
info
.
b_matrix
.
row_stride
;
for
(
size_t
k_
=
0
;
k_
<
info
.
k
;
++
k_
)
{
auto
a_
=
reinterpret_cast
<
const
Tdata
*>
(
a
)
+
i
*
info
.
a_matrix
.
stride
+
m_
*
info
.
a_matrix
.
row_stride
+
k_
*
info
.
a_matrix
.
col_stride
;
auto
b_
=
reinterpret_cast
<
const
Tdata
*>
(
b
)
+
i
*
info
.
b_matrix
.
stride
+
n_
*
info
.
b_matrix
.
col_stride
+
k_
*
info
.
b_matrix
.
row_stride
;
if
constexpr
(
std
::
is_same
<
Tdata
,
uint16_t
>::
value
)
{
sum
+=
f16_to_f32
(
*
a_
)
*
f16_to_f32
(
*
b_
);
}
else
{
...
...
@@ -77,8 +77,8 @@ infiniopStatus_t Descriptor::calculate(
size_t
workspace_size
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
void
*
stream
)
const
{
...
...
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
View file @
3144cc9c
...
...
@@ -26,13 +26,13 @@ infiniopStatus_t Descriptor::create(
}
infiniopStatus_t
status
;
auto
_
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
*
desc_ptr
=
new
Descriptor
(
dtype
,
_
info
,
0
,
dtype
,
info
,
0
,
new
Opaque
{
handle
->
cublas_handle_pool
},
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
...
...
@@ -40,16 +40,16 @@ infiniopStatus_t Descriptor::create(
template
<
typename
Tdata
>
void
calculate
(
MatmulInfo
const
&
_
info
,
const
MatmulInfo
&
info
,
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
&
cublas_handle_pool
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
cudaStream_t
stream
)
{
if
(
_
info
.
is_transed
)
{
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
...
...
@@ -67,8 +67,8 @@ void calculate(
#endif
}
auto
op_a
=
_
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
_
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_a
=
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
use_cublas
(
cublas_handle_pool
,
stream
,
...
...
@@ -77,24 +77,24 @@ void calculate(
handle
,
op_a
,
op_b
,
static_cast
<
int
>
(
_
info
.
m
),
static_cast
<
int
>
(
_
info
.
n
),
static_cast
<
int
>
(
_
info
.
k
),
static_cast
<
int
>
(
info
.
m
),
static_cast
<
int
>
(
info
.
n
),
static_cast
<
int
>
(
info
.
k
),
&
alpha
,
a
,
a_type
,
static_cast
<
int
>
(
_
info
.
a_matrix
.
ld
()),
_
info
.
a_matrix
.
stride
,
static_cast
<
int
>
(
info
.
a_matrix
.
ld
()),
info
.
a_matrix
.
stride
,
b
,
b_type
,
static_cast
<
int
>
(
_
info
.
b_matrix
.
ld
()),
_
info
.
b_matrix
.
stride
,
static_cast
<
int
>
(
info
.
b_matrix
.
ld
()),
info
.
b_matrix
.
stride
,
&
beta
,
c
,
c_type
,
static_cast
<
int
>
(
_
info
.
c_matrix
.
ld
()),
_
info
.
c_matrix
.
stride
,
static_cast
<
int
>
(
_
info
.
batch
),
static_cast
<
int
>
(
info
.
c_matrix
.
ld
()),
info
.
c_matrix
.
stride
,
static_cast
<
int
>
(
info
.
batch
),
compute_type
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
});
...
...
@@ -105,8 +105,8 @@ infiniopStatus_t Descriptor::calculate(
size_t
workspace_size
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
void
*
stream
)
const
{
...
...
src/infiniop/ops/matmul/matmul.h
View file @
3144cc9c
...
...
@@ -83,8 +83,8 @@
size_t workspace_size, \
void *c, \
float beta, \
void
const *a, \
void
const *b, \
const
void
*a, \
const
void
*b, \
float alpha, \
void *stream) const; \
}; \
...
...
src/infiniop/ops/matmul/operator.cc
View file @
3144cc9c
...
...
@@ -58,7 +58,7 @@ infiniopGetMatmulWorkspaceSize(
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<matmul::NAMESPACE::Descriptor
const
*>(desc)->workspace_size; \
*size = reinterpret_cast<
const
matmul::NAMESPACE::Descriptor *>(desc)->workspace_size; \
return INFINIOP_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
...
...
@@ -87,15 +87,15 @@ __C infiniopStatus_t infiniopMatmul(
infiniopMatmulDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
float
beta
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<matmul::NAMESPACE::Descriptor
const
*>(desc) \
return reinterpret_cast<
const
matmul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \
c, beta, \
a, b, alpha, \
...
...
@@ -128,7 +128,7 @@ infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<matmul::NAMESPACE::Descriptor
const
*>(desc); \
delete reinterpret_cast<
const
matmul::NAMESPACE::Descriptor *>(desc); \
return INFINIOP_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
...
...
src/infiniop/ops/random_sample/operator.cc
View file @
3144cc9c
...
...
@@ -79,7 +79,7 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
const
void
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
...
...
src/infiniop/ops/rearrange/operator.cc
View file @
3144cc9c
...
...
@@ -43,7 +43,7 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
return
INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniopStatus_t
infiniopRearrange
(
infiniopRearrangeDescriptor_t
desc
,
void
*
dst
,
void
const
*
src
,
void
*
stream
)
{
__C
infiniopStatus_t
infiniopRearrange
(
infiniopRearrangeDescriptor_t
desc
,
void
*
dst
,
const
void
*
src
,
void
*
stream
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
...
...
src/infiniop/ops/rms_norm/operator.cc
View file @
3144cc9c
...
...
@@ -84,7 +84,7 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t
}
__C
infiniopStatus_t
infiniopRMSNorm
(
infiniopRMSNormDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
void
const
*
x
,
void
const
*
w
,
void
*
stream
)
{
void
*
y
,
const
void
*
x
,
const
void
*
w
,
void
*
stream
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU
case
DevCpu
:
...
...
src/infiniop/ops/rotary_embedding/operator.cc
View file @
3144cc9c
...
...
@@ -91,8 +91,8 @@ __C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
__C
infiniopStatus_t
infiniopRoPE
(
infiniopRoPEDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
t
,
void
const
*
pos_ids
,
void
const
*
sin_table
,
void
const
*
cos_table
,
void
*
t
,
const
void
*
pos_ids
,
const
void
*
sin_table
,
const
void
*
cos_table
,
void
*
stream
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU
...
...
src/infiniop/ops/swiglu/operator.cc
View file @
3144cc9c
...
...
@@ -46,7 +46,7 @@ __C infiniopStatus_t infiniopCreateSwiGLUDescriptor(
};
__C
infiniopStatus_t
infiniopSwiGLU
(
infiniopSwiGLUDescriptor_t
desc
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
const
void
*
a
,
const
void
*
b
,
void
*
stream
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment