Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
802a75d3
Commit
802a75d3
authored
May 07, 2025
by
crapromer
Browse files
fix conflicts of operator.cc in swiglu
parent
7d2acaf7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
31 additions
and
39 deletions
+31
-39
src/infiniop/devices/maca/maca_kernel_common.h
src/infiniop/devices/maca/maca_kernel_common.h
+0
-5
src/infiniop/elementwise/maca/elementwise_maca.h
src/infiniop/elementwise/maca/elementwise_maca.h
+7
-7
src/infiniop/elementwise/maca/elementwise_maca_api.h
src/infiniop/elementwise/maca/elementwise_maca_api.h
+2
-2
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+9
-4
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+13
-21
No files found.
src/infiniop/devices/maca/maca_kernel_common.h
View file @
802a75d3
#ifdef ENABLE_SUGON_MACA_API
#define INFINIOP_MACA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_MACA_KERNEL __global__ void
#endif
// Posible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
...
...
src/infiniop/elementwise/maca/elementwise_maca.h
View file @
802a75d3
...
...
@@ -107,7 +107,7 @@ struct DeviceImpl::Opaque {
Opaque
(
const
std
::
shared_ptr
<
device
::
maca
::
Handle
::
Internal
>
&
internal
)
:
internal
(
internal
)
{}
template
<
size
_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
template
<
uint32
_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
calculateImpl
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
...
...
@@ -122,7 +122,7 @@ struct DeviceImpl::Opaque {
std
::
forward
<
Args
>
(
args
)...);
}
template
<
size
_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
template
<
uint32
_t
BLOCK_SIZE
,
size_t
N
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
=
0
>
infiniStatus_t
calculateImpl
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
...
...
@@ -174,7 +174,7 @@ private:
return
INFINI_STATUS_SUCCESS
;
}
template
<
size
_t
BLOCK_SIZE
,
size_t
N
,
typename
KernelFunc
,
typename
Tout
,
typename
...
Args
>
template
<
uint32
_t
BLOCK_SIZE
,
size_t
N
,
typename
KernelFunc
,
typename
Tout
,
typename
...
Args
>
infiniStatus_t
launchElementwiseKernel
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
...
...
@@ -203,8 +203,8 @@ private:
d_output_shape
,
d_output_strides
,
d_input_shapes
,
d_input_strides
,
stream
));
dim3
blockDims
(
std
::
min
(
BLOCK_SIZE
,
static_cast
<
size
_t
>
(
internal
->
maxThreadsPerBlock
())));
dim3
gridDims
(
std
::
min
(
CEIL_DIV
(
output_size
,
blockDims
.
x
),
static_cast
<
size
_t
>
(
internal
->
gridSizeX
())));
dim3
blockDims
(
std
::
min
(
BLOCK_SIZE
,
static_cast
<
uint32
_t
>
(
internal
->
maxThreadsPerBlock
())));
dim3
gridDims
(
std
::
min
(
uint32_t
(
CEIL_DIV
(
output_size
,
blockDims
.
x
)
)
,
static_cast
<
uint32
_t
>
(
internal
->
gridSizeX
())));
size_t
step
=
gridDims
.
x
*
blockDims
.
x
;
for
(
size_t
i
=
0
;
i
<
output_size
;
i
+=
step
)
{
...
...
@@ -228,7 +228,7 @@ utils::Result<DeviceImpl *> DeviceImpl::create(Args &&...args) {
}
/* Invoke elementwise operation for different input types */
template
<
u
nsigned
in
t
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
template
<
u
int32_
t
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
>
infiniStatus_t
DeviceImpl
::
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
...
...
@@ -245,7 +245,7 @@ infiniStatus_t DeviceImpl::calculate(const op::elementwise::ElementwiseInfo &inf
}
/* Invoke elementwise operation when all inputs have the same dtype */
template
<
u
nsigned
in
t
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
template
<
u
int32_
t
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
DeviceImpl
::
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
void
*
output
,
...
...
src/infiniop/elementwise/maca/elementwise_maca_api.h
View file @
802a75d3
...
...
@@ -17,7 +17,7 @@ public:
template
<
typename
...
Args
>
static
utils
::
Result
<
DeviceImpl
*>
create
(
Args
&&
...
args
);
template
<
u
nsigned
in
t
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
template
<
u
int32_
t
BLOCK_SIZE
,
typename
Op
,
typename
Tdata
,
typename
...
Args
>
infiniStatus_t
calculate
(
const
op
::
elementwise
::
ElementwiseInfo
&
info
,
void
*
workspace
,
...
...
@@ -26,7 +26,7 @@ public:
void
*
stream
,
Args
&&
...
args
);
template
<
u
nsigned
in
t
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
template
<
u
int32_
t
BLOCK_SIZE
,
typename
Op
,
typename
Tout
,
typename
...
Tin
,
typename
...
Args
,
std
::
enable_if_t
<
(
sizeof
...(
Tin
)
==
Op
::
num_inputs
),
int
>
=
0
>
infiniStatus_t
calculate
(
...
...
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
View file @
802a75d3
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__
#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>
namespace
op
::
swiglu
::
maca
{
typedef
struct
SwiGLUOp
{
private:
template
<
typename
T
>
__device__
__forceinline__
T
sigmoid
(
const
T
&
x
)
const
{
// if constexpr (std::is_same_v<T, half2>) {
// return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
// } else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
return
h2rcp
(
__hadd2
(
make_half2
(
1
,
1
),
h2exp
(
__hneg2
(
x
))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
hrcp
(
__hadd
(
half
(
1.
f
),
__float2half
(
__expf
(
__half2float
(
__hneg
(
x
))))));
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__frcp_rd
(
__fadd_rd
(
1
,
__expf
(
-
x
)));
...
...
@@ -33,3 +36,5 @@ public:
}
}
SwiGLUOp
;
}
// namespace op::swiglu::maca
#endif
src/infiniop/ops/swiglu/operator.cc
View file @
802a75d3
...
...
@@ -42,13 +42,11 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -92,18 +90,16 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
=======
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
GET
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
}
#endif
...
...
@@ -149,13 +145,11 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -197,13 +191,11 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
<
<
<
<
<
<
<
HEAD
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
=======
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
>>>>>>>
f3a0177
(
Migrate
elementwise
base
from
cuda
to
maca
,
and
implement
swiglu
with
test
pass
)
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment