Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
0166515c
Unverified
Commit
0166515c
authored
Aug 07, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 07, 2025
Browse files
Merge branch 'main' into issue/300
parents
f0300ff3
a23c4d13
Changes
175
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
345 additions
and
116 deletions
+345
-116
src/infiniop/ops/random_sample/metax/random_sample_kernel.h
src/infiniop/ops/random_sample/metax/random_sample_kernel.h
+13
-8
src/infiniop/ops/random_sample/metax/random_sample_metax.h
src/infiniop/ops/random_sample/metax/random_sample_metax.h
+8
-0
src/infiniop/ops/random_sample/metax/random_sample_metax.maca
...infiniop/ops/random_sample/metax/random_sample_metax.maca
+17
-16
src/infiniop/ops/random_sample/nvidia/random_sample_kernel.cuh
...nfiniop/ops/random_sample/nvidia/random_sample_kernel.cuh
+8
-8
src/infiniop/ops/random_sample/nvidia/random_sample_nvidia.cu
...infiniop/ops/random_sample/nvidia/random_sample_nvidia.cu
+6
-6
src/infiniop/ops/random_sample/nvidia/random_sample_nvidia.cuh
...nfiniop/ops/random_sample/nvidia/random_sample_nvidia.cuh
+1
-1
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+27
-15
src/infiniop/ops/rearrange/maca/rearrange_maca.h
src/infiniop/ops/rearrange/maca/rearrange_maca.h
+0
-8
src/infiniop/ops/rearrange/metax/rearrange_kernel.h
src/infiniop/ops/rearrange/metax/rearrange_kernel.h
+5
-5
src/infiniop/ops/rearrange/metax/rearrange_metax.h
src/infiniop/ops/rearrange/metax/rearrange_metax.h
+8
-0
src/infiniop/ops/rearrange/metax/rearrange_metax.maca
src/infiniop/ops/rearrange/metax/rearrange_metax.maca
+14
-14
src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
src/infiniop/ops/rearrange/nvidia/rearrange_kernel.cuh
+10
-10
src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cu
+10
-10
src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cuh
src/infiniop/ops/rearrange/nvidia/rearrange_nvidia.cuh
+1
-1
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+21
-14
src/infiniop/ops/relu/cpu/relu_cpu.cc
src/infiniop/ops/relu/cpu/relu_cpu.cc
+52
-0
src/infiniop/ops/relu/cpu/relu_cpu.h
src/infiniop/ops/relu/cpu/relu_cpu.h
+22
-0
src/infiniop/ops/relu/metax/relu_metax.h
src/infiniop/ops/relu/metax/relu_metax.h
+12
-0
src/infiniop/ops/relu/metax/relu_metax.maca
src/infiniop/ops/relu/metax/relu_metax.maca
+80
-0
src/infiniop/ops/relu/ninetoothed/build.py
src/infiniop/ops/relu/ninetoothed/build.py
+30
-0
No files found.
src/infiniop/ops/random_sample/m
aca
/random_sample_kernel.h
→
src/infiniop/ops/random_sample/m
etax
/random_sample_kernel.h
View file @
0166515c
#include "../../../devices/m
aca/maca
_kernel_common.h"
#include "../../../devices/m
etax/metax
_kernel_common.h"
#include "infinicore.h"
#include "infinicore.h"
#include <hccub/device/device_radix_sort.cuh>
#include <hccub/device/device_radix_sort.cuh>
#include <hccub/device/device_reduce.cuh>
#include <hccub/device/device_reduce.cuh>
#include <hccub/device/device_scan.cuh>
#include <hccub/device/device_scan.cuh>
namespace
op
::
random_sample
::
m
aca
{
namespace
op
::
random_sample
::
m
etax
{
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
...
@@ -62,7 +62,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
...
@@ -62,7 +62,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
const
auto
n
=
static_cast
<
int
>
(
n_
);
const
auto
n
=
static_cast
<
int
>
(
n_
);
size_t
argmax
;
size_t
argmax
;
CHECK_M
ACA
(
argMax_
<
Tval
>
(
CHECK_M
ETAX
(
argMax_
<
Tval
>
(
nullptr
,
nullptr
,
n
,
nullptr
,
nullptr
,
n
,
nullptr
,
argmax
,
nullptr
,
argmax
,
nullptr
));
nullptr
));
...
@@ -77,7 +77,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
...
@@ -77,7 +77,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
size_random
+=
align256
(
sizeof
(
Tidx
)
*
n
);
size_random
+=
align256
(
sizeof
(
Tidx
)
*
n
);
// cub device api
// cub device api
size_t
size_radix_sort
;
size_t
size_radix_sort
;
CHECK_M
ACA
((
radixSort
<
Tval
,
Tidx
>
(
CHECK_M
ETAX
((
radixSort
<
Tval
,
Tidx
>
(
nullptr
,
size_radix_sort
,
nullptr
,
size_radix_sort
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
...
@@ -85,7 +85,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
...
@@ -85,7 +85,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
nullptr
)));
nullptr
)));
size_t
size_inclusive_sum
;
size_t
size_inclusive_sum
;
CHECK_M
ACA
(
inclusiveSum
<
Tval
>
(
CHECK_M
ETAX
(
inclusiveSum
<
Tval
>
(
nullptr
,
size_inclusive_sum
,
nullptr
,
size_inclusive_sum
,
nullptr
,
n
,
nullptr
,
n
,
nullptr
));
nullptr
));
...
@@ -107,6 +107,11 @@ struct CudaTval<fp16_t> {
...
@@ -107,6 +107,11 @@ struct CudaTval<fp16_t> {
using
Type
=
half
;
using
Type
=
half
;
};
};
template
<
>
struct
CudaTval
<
bf16_t
>
{
using
Type
=
__hpcc_bfloat16
;
};
// ↑↑↑ 通过特化将 fp16_t 转换为 half
// ↑↑↑ 通过特化将 fp16_t 转换为 half
// ↓↓↓ 用于采样过程的小型 kernel
// ↓↓↓ 用于采样过程的小型 kernel
...
@@ -228,7 +233,7 @@ struct Algo {
...
@@ -228,7 +233,7 @@ struct Algo {
auto
grid
=
(
n
+
block
-
1
)
/
block
;
auto
grid
=
(
n
+
block
-
1
)
/
block
;
// sort
// sort
fillIndices
<<<
grid
,
block
,
0
,
stream
>>>
(
indices
,
n
);
fillIndices
<<<
grid
,
block
,
0
,
stream
>>>
(
indices
,
n
);
CHECK_M
ACA
(
radixSort
(
CHECK_M
ETAX
(
radixSort
(
workspace_
,
workspace_size
,
workspace_
,
workspace_size
,
logits
,
sorted
,
logits
,
sorted
,
indices
,
indices_out
,
indices
,
indices_out
,
...
@@ -238,7 +243,7 @@ struct Algo {
...
@@ -238,7 +243,7 @@ struct Algo {
partialSoftmaxKernel
<<<
grid
,
block
,
0
,
stream
>>>
(
sorted
,
n
,
temperature
);
partialSoftmaxKernel
<<<
grid
,
block
,
0
,
stream
>>>
(
sorted
,
n
,
temperature
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
// sum
// sum
CHECK_M
ACA
(
inclusiveSum
(
CHECK_M
ETAX
(
inclusiveSum
(
workspace_
,
workspace
,
workspace_
,
workspace
,
sorted
,
n
,
sorted
,
n
,
stream
));
stream
));
...
@@ -251,4 +256,4 @@ struct Algo {
...
@@ -251,4 +256,4 @@ struct Algo {
}
}
};
};
}
// namespace op::random_sample::m
aca
}
// namespace op::random_sample::m
etax
src/infiniop/ops/random_sample/metax/random_sample_metax.h
0 → 100644
View file @
0166515c
#ifndef __RANDOM_SAMPLE_METAX_H__
#define __RANDOM_SAMPLE_METAX_H__
#include "../random_sample.h"
DESCRIPTOR
(
metax
)
#endif // __RANDOM_SAMPLE_METAX_H__
src/infiniop/ops/random_sample/m
aca
/random_sample_m
aca
.maca
→
src/infiniop/ops/random_sample/m
etax
/random_sample_m
etax
.maca
View file @
0166515c
#include "../../../devices/m
aca/
common
_maca
.h"
#include "../../../devices/m
etax/metax_
common.h"
#include "../../../devices/m
aca/maca
_handle.h"
#include "../../../devices/m
etax/metax
_handle.h"
#include "../info.h"
#include "../info.h"
#include "random_sample_kernel.h"
#include "random_sample_kernel.h"
#include "random_sample_m
aca
.h"
#include "random_sample_m
etax
.h"
namespace op::random_sample::m
aca
{
namespace op::random_sample::m
etax
{
struct Descriptor::Opaque {
struct Descriptor::Opaque {
std::shared_ptr<device::m
aca
::Handle::Internal> internal;
std::shared_ptr<device::m
etax
::Handle::Internal> internal;
};
};
Descriptor::~Descriptor() {
Descriptor::~Descriptor() {
...
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
...
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
Descriptor **desc_ptr,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) {
infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::m
aca
::Handle *>(handle_);
auto handle = reinterpret_cast<device::m
etax
::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result);
CHECK_RESULT(result);
...
@@ -34,15 +34,16 @@ infiniStatus_t Descriptor::create(
...
@@ -34,15 +34,16 @@ infiniStatus_t Descriptor::create(
workspace_size = workspace_result.take(); \
workspace_size = workspace_result.take(); \
} break
} break
#define CASE_I(CASE, Tidx) \
#define CASE_I(CASE, Tidx) \
case CASE: \
case CASE: \
switch (info.dt_p) { \
switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
CASE_P(INFINI_DTYPE_BF16, Tidx, __hpcc_bfloat16); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
default: \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
abort(); \
default: \
} \
abort(); \
} \
break
break
switch (info.dt_i) {
switch (info.dt_i) {
...
@@ -99,4 +100,4 @@ infiniStatus_t Descriptor::calculate(
...
@@ -99,4 +100,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS;
}
}
} // namespace op::random_sample::m
aca
} // namespace op::random_sample::m
etax
src/infiniop/ops/random_sample/
cud
a/random_sample_kernel.cuh
→
src/infiniop/ops/random_sample/
nvidi
a/random_sample_kernel.cuh
View file @
0166515c
#
include
"../../../devices/
cuda/cud
a_kernel_common.cuh"
#
include
"../../../devices/
nvidia/nvidi
a_kernel_common.cuh"
#include "infinicore.h"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
#include <cub/device/device_scan.cuh>
namespace
op
::
random_sample
::
cud
a
{
namespace
op
::
random_sample
::
nvidi
a
{
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
...
@@ -193,7 +193,7 @@ struct Algo {
...
@@ -193,7 +193,7 @@ struct Algo {
argMax_
(
argMax_
(
kv_pair
,
kv_pair
,
logits
,
logits
,
n
,
static_cast
<
int
>
(
n
)
,
workspace
,
workspace
,
workspace_size
,
stream
);
workspace_size
,
stream
);
castIdx
<<<
1
,
1
,
0
,
stream
>>>
((
Tidx
*
)
result
,
kv_pair
);
castIdx
<<<
1
,
1
,
0
,
stream
>>>
((
Tidx
*
)
result
,
kv_pair
);
...
@@ -232,20 +232,20 @@ struct Algo {
...
@@ -232,20 +232,20 @@ struct Algo {
auto
block
=
cub
::
Min
()((
size_t
)
block_size
,
n
);
auto
block
=
cub
::
Min
()((
size_t
)
block_size
,
n
);
auto
grid
=
(
n
+
block
-
1
)
/
block
;
auto
grid
=
(
n
+
block
-
1
)
/
block
;
// sort
// sort
fillIndices
<<<
grid
,
block
,
0
,
stream
>>>
(
indices
,
n
);
fillIndices
<<<
static_cast
<
unsigned
int
>
(
grid
)
,
static_cast
<
unsigned
int
>
(
block
)
,
0
,
stream
>>>
(
indices
,
static_cast
<
int
>
(
n
)
);
CHECK_CUDA
(
radixSort
(
CHECK_CUDA
(
radixSort
(
workspace_
,
workspace_size
,
workspace_
,
workspace_size
,
logits
,
sorted
,
logits
,
sorted
,
indices
,
indices_out
,
indices
,
indices_out
,
n
,
static_cast
<
int
>
(
n
)
,
stream
));
stream
));
// softmax
// softmax
partialSoftmaxKernel
<<<
grid
,
block
,
0
,
stream
>>>
(
sorted
,
n
,
temperature
);
partialSoftmaxKernel
<<<
static_cast
<
unsigned
int
>
(
grid
)
,
static_cast
<
unsigned
int
>
(
block
)
,
0
,
stream
>>>
(
sorted
,
static_cast
<
int
>
(
n
)
,
temperature
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
// sum
// sum
CHECK_CUDA
(
inclusiveSum
(
CHECK_CUDA
(
inclusiveSum
(
workspace_
,
workspace
,
workspace_
,
workspace
,
sorted
,
n
,
sorted
,
static_cast
<
int
>
(
n
)
,
stream
));
stream
));
// sample
// sample
randomSampleKernel
<<<
1
,
1
,
0
,
stream
>>>
(
randomSampleKernel
<<<
1
,
1
,
0
,
stream
>>>
(
...
@@ -256,4 +256,4 @@ struct Algo {
...
@@ -256,4 +256,4 @@ struct Algo {
}
}
};
};
}
// namespace op::random_sample::
cud
a
}
// namespace op::random_sample::
nvidi
a
src/infiniop/ops/random_sample/
cud
a/random_sample_
cud
a.cu
→
src/infiniop/ops/random_sample/
nvidi
a/random_sample_
nvidi
a.cu
View file @
0166515c
#
include
"../../../devices/
cuda/cud
a_handle.cuh"
#
include
"../../../devices/
nvidia/nvidi
a_handle.cuh"
#include "../info.h"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"
#include "random_sample_kernel.cuh"
#include "random_sample_nvidia.cuh"
namespace
op
::
random_sample
::
cud
a
{
namespace
op
::
random_sample
::
nvidi
a
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
};
Descriptor
::~
Descriptor
()
{
Descriptor
::~
Descriptor
()
{
...
@@ -18,7 +18,7 @@ infiniStatus_t Descriptor::create(
...
@@ -18,7 +18,7 @@ infiniStatus_t Descriptor::create(
Descriptor
**
desc_ptr
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
result_desc
,
infiniopTensorDescriptor_t
result_desc
,
infiniopTensorDescriptor_t
probs_desc
)
{
infiniopTensorDescriptor_t
probs_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
result
=
RandomSampleInfo
::
create
(
result_desc
,
probs_desc
);
auto
result
=
RandomSampleInfo
::
create
(
result_desc
,
probs_desc
);
CHECK_RESULT
(
result
);
CHECK_RESULT
(
result
);
...
@@ -99,4 +99,4 @@ infiniStatus_t Descriptor::calculate(
...
@@ -99,4 +99,4 @@ infiniStatus_t Descriptor::calculate(
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace op::random_sample::
cud
a
}
// namespace op::random_sample::
nvidi
a
src/infiniop/ops/random_sample/
cud
a/random_sample_
cud
a.cuh
→
src/infiniop/ops/random_sample/
nvidi
a/random_sample_
nvidi
a.cuh
View file @
0166515c
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../random_sample.h"
#include "../random_sample.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif // __RANDOM_SAMPLE_CUDA_CUH__
#endif // __RANDOM_SAMPLE_CUDA_CUH__
src/infiniop/ops/random_sample/operator.cc
View file @
0166515c
...
@@ -5,11 +5,11 @@
...
@@ -5,11 +5,11 @@
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#include "cpu/random_sample_cpu.h"
#endif
#endif
#ifdef
ENABLE_CUDA
_API
#if
def
ined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR
_API
)
#include "
cud
a/random_sample_
cud
a.cuh"
#include "
nvidi
a/random_sample_
nvidi
a.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "m
aca
/random_sample_m
aca
.h"
#include "m
etax
/random_sample_m
etax
.h"
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
#include "ascend/random_sample_aclnn.h"
#include "ascend/random_sample_aclnn.h"
...
@@ -35,11 +35,14 @@ infiniopCreateRandomSampleDescriptor(
...
@@ -35,11 +35,14 @@ infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
...
@@ -68,11 +71,14 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
...
@@ -68,11 +71,14 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
);
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
);
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
...
@@ -111,11 +117,14 @@ __C infiniStatus_t infiniopRandomSample(
...
@@ -111,11 +117,14 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
...
@@ -141,11 +150,14 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
...
@@ -141,11 +150,14 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
...
...
src/infiniop/ops/rearrange/maca/rearrange_maca.h
deleted
100644 → 0
View file @
f0300ff3
#ifndef __REARRANGE_MACA_H__
#define __REARRANGE_MACA_H__
#include "../rearrange.h"
DESCRIPTOR
(
maca
)
#endif // __REARRANGE_MACA_H__
src/infiniop/ops/rearrange/m
aca
/rearrange_kernel.h
→
src/infiniop/ops/rearrange/m
etax
/rearrange_kernel.h
View file @
0166515c
#ifndef __REARRANGE_M
ACA
_KERNEL_H__
#ifndef __REARRANGE_M
ETAX
_KERNEL_H__
#define __REARRANGE_M
ACA
_KERNEL_H__
#define __REARRANGE_M
ETAX
_KERNEL_H__
#include "../../../devices/m
aca/
common
_maca
.h"
#include "../../../devices/m
etax/metax_
common.h"
#include "../../../devices/m
aca/maca
_kernel_common.h"
#include "../../../devices/m
etax/metax
_kernel_common.h"
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t
#define ARRAY_TYPE_SIZE size_t
...
@@ -328,4 +328,4 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
...
@@ -328,4 +328,4 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
return
utils
::
Result
<
void
*>
(
kernel_func
);
return
utils
::
Result
<
void
*>
(
kernel_func
);
}
}
#endif // __REARRANGE_M
ACA
_KERNEL_H__
#endif // __REARRANGE_M
ETAX
_KERNEL_H__
src/infiniop/ops/rearrange/metax/rearrange_metax.h
0 → 100644
View file @
0166515c
#ifndef __REARRANGE_METAX_H__
#define __REARRANGE_METAX_H__
#include "../rearrange.h"
DESCRIPTOR
(
metax
)
#endif // __REARRANGE_METAX_H__
src/infiniop/ops/rearrange/m
aca
/rearrange_m
aca
.maca
→
src/infiniop/ops/rearrange/m
etax
/rearrange_m
etax
.maca
View file @
0166515c
#include "../../../tensor.h"
#include "../../../tensor.h"
#include "rearrange_kernel.h"
#include "rearrange_kernel.h"
#include "rearrange_m
aca
.h"
#include "rearrange_m
etax
.h"
#include <algorithm>
#include <algorithm>
#include <cmath>
#include <cmath>
#include <memory>
#include <memory>
#include <stdint.h>
#include <stdint.h>
#include <vector>
#include <vector>
namespace op::rearrange::m
aca
{
namespace op::rearrange::m
etax
{
struct Descriptor::Opaque {
struct Descriptor::Opaque {
std::shared_ptr<device::m
aca
::Handle::Internal> internal;
std::shared_ptr<device::m
etax
::Handle::Internal> internal;
};
};
Descriptor::~Descriptor() {
Descriptor::~Descriptor() {
...
@@ -47,7 +47,7 @@ infiniStatus_t Descriptor::create(
...
@@ -47,7 +47,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor(
*desc_ptr = new Descriptor(
std::move(*meta),
std::move(*meta),
new Opaque{reinterpret_cast<device::m
aca
::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::m
etax
::Handle *>(handle)->internal()},
handle->device, handle->device_id);
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS;
}
}
...
@@ -429,18 +429,18 @@ infiniStatus_t launchKernel(
...
@@ -429,18 +429,18 @@ infiniStatus_t launchKernel(
infiniStatus_t Descriptor::calculate(
infiniStatus_t Descriptor::calculate(
void *y,
void *y,
const void *x,
const void *x,
void *stream) const {
void *stream
_
) const {
auto
maca_
stream = reinterpret_cast<hcStream_t>(stream);
auto stream = reinterpret_cast<hcStream_t>(stream
_
);
// 如果没有维度,直接进行内存拷贝
// 如果没有维度,直接进行内存拷贝
if (_meta.ndim() == 0) {
if (_meta.ndim() == 0) {
auto err = hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice,
maca_
stream);
auto err = hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, stream);
if (err != hcSuccess) {
if (err != hcSuccess) {
return INFINI_STATUS_INTERNAL_ERROR;
return INFINI_STATUS_INTERNAL_ERROR;
}
}
CHECK_OR_RETURN(hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice,
maca_
stream) == hcSuccess,
CHECK_OR_RETURN(hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, stream) == hcSuccess,
INFINI_STATUS_INTERNAL_ERROR);
INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS;
}
}
...
@@ -449,7 +449,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -449,7 +449,7 @@ infiniStatus_t Descriptor::calculate(
int max_threads = _opaque->internal->maxThreadsPerBlock();
int max_threads = _opaque->internal->maxThreadsPerBlock();
// 准备参数
// 准备参数
auto params_result = prepareRearrangeParams(_meta, std::min(M
ACA
_BLOCK_SIZE_1024, max_threads));
auto params_result = prepareRearrangeParams(_meta, std::min(M
ETAX
_BLOCK_SIZE_1024, max_threads));
CHECK_RESULT(params_result);
CHECK_RESULT(params_result);
auto params = params_result.take();
auto params = params_result.take();
...
@@ -469,10 +469,10 @@ infiniStatus_t Descriptor::calculate(
...
@@ -469,10 +469,10 @@ infiniStatus_t Descriptor::calculate(
size_t block_size = params.block_len_total;
size_t block_size = params.block_len_total;
if (block_size <= M
ACA
_BLOCK_SIZE_512) {
if (block_size <= M
ETAX
_BLOCK_SIZE_512) {
status = launchKernel<M
ACA
_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(),
maca_
stream);
status = launchKernel<M
ETAX
_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), stream);
} else if (block_size <= M
ACA
_BLOCK_SIZE_1024) {
} else if (block_size <= M
ETAX
_BLOCK_SIZE_1024) {
status = launchKernel<M
ACA
_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(),
maca_
stream);
status = launchKernel<M
ETAX
_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), stream);
} else {
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
}
...
@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate(
...
@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate(
return status;
return status;
}
}
} // namespace op::rearrange::m
aca
} // namespace op::rearrange::m
etax
src/infiniop/ops/rearrange/
cud
a/rearrange_kernel.cuh
→
src/infiniop/ops/rearrange/
nvidi
a/rearrange_kernel.cuh
View file @
0166515c
#ifndef __REARRANGE_CUDA_KERNEL_H__
#ifndef __REARRANGE_CUDA_KERNEL_H__
#define __REARRANGE_CUDA_KERNEL_H__
#define __REARRANGE_CUDA_KERNEL_H__
#include "../../../devices/
cuda/cud
a_common.cuh"
#include "../../../devices/
nvidia/nvidi
a_common.cuh"
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t
#define ARRAY_TYPE_SIZE size_t
...
@@ -63,13 +63,13 @@ struct Constraint {
...
@@ -63,13 +63,13 @@ struct Constraint {
size_t remaining \
size_t remaining \
= blockIdx.x; \
= blockIdx.x; \
\
\
for (
ssize
_t i = grid_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
dst_offset += idx * dst_grid_stride.a[i]; \
if (constraint_num > 0) { \
if (constraint_num > 0) { \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].grid_idx) { \
if (i == constraints.a[j].grid_idx) { \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
} \
} \
...
@@ -80,7 +80,7 @@ struct Constraint {
...
@@ -80,7 +80,7 @@ struct Constraint {
/* 将结果存入共享内存 */
\
/* 将结果存入共享内存 */
\
shared_src_offset = src_offset; \
shared_src_offset = src_offset; \
shared_dst_offset = dst_offset; \
shared_dst_offset = dst_offset; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
} \
} \
} \
} \
...
@@ -92,18 +92,18 @@ struct Constraint {
...
@@ -92,18 +92,18 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
} \
} \
\
\
for (
ssize
_t i = block_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = block_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
/* 计算偏移量 */
\
src_offset += idx * src_block_stride.a[i]; \
src_offset += idx * src_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
dst_offset += idx * dst_block_stride.a[i]; \
if (constraint_num > 0) { \
if (constraint_num > 0) { \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (i == constraints.a[j].block_idx) { \
if (i == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
return; \
return; \
...
@@ -115,7 +115,7 @@ struct Constraint {
...
@@ -115,7 +115,7 @@ struct Constraint {
\
\
src_offset += remaining * src_block_stride.a[0]; \
src_offset += remaining * src_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
dst_offset += remaining * dst_block_stride.a[0]; \
for (
ssize
_t j = 0; j < constraint_num; j++) {
\
for (
ptrdiff
_t j = 0; j < constraint_num; j++) { \
if (0 == constraints.a[j].block_idx) { \
if (0 == constraints.a[j].block_idx) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
return; \
return; \
...
@@ -133,7 +133,7 @@ struct Constraint {
...
@@ -133,7 +133,7 @@ struct Constraint {
ptrdiff_t dst_offset = 0; \
ptrdiff_t dst_offset = 0; \
size_t remaining = blockIdx.x; \
size_t remaining = blockIdx.x; \
\
\
for (
ssize
_t i = grid_array_size - 1; i >= 0; i--) {
\
for (
ptrdiff
_t i = grid_array_size - 1; i >= 0; i--) { \
size_t idx = remaining % grid_len.a[i]; \
size_t idx = remaining % grid_len.a[i]; \
remaining /= grid_len.a[i]; \
remaining /= grid_len.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
src_offset += idx * src_grid_stride.a[i]; \
...
@@ -152,7 +152,7 @@ struct Constraint {
...
@@ -152,7 +152,7 @@ struct Constraint {
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t src_offset = shared_src_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
ptrdiff_t dst_offset = shared_dst_offset; \
\
\
for (
ssize
_t i = block_array_size - 1; i > 0; i--) {
\
for (
ptrdiff
_t i = block_array_size - 1; i > 0; i--) { \
size_t idx = remaining % block_len.a[i]; \
size_t idx = remaining % block_len.a[i]; \
remaining /= block_len.a[i]; \
remaining /= block_len.a[i]; \
/* 计算偏移量 */
\
/* 计算偏移量 */
\
...
...
src/infiniop/ops/rearrange/
cud
a/rearrange_
cud
a.cu
→
src/infiniop/ops/rearrange/
nvidi
a/rearrange_
nvidi
a.cu
View file @
0166515c
#include "../../../devices/
cuda/cud
a_common.cuh"
#include "../../../devices/
nvidia/nvidi
a_common.cuh"
#include "../../../devices/
cuda/cud
a_kernel_common.cuh"
#include "../../../devices/
nvidia/nvidi
a_kernel_common.cuh"
#include "../../../tensor.h"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include "rearrange_kernel.cuh"
#include "rearrange_nvidia.cuh"
#include <algorithm>
#include <algorithm>
#include <cmath>
#include <cmath>
#include <memory>
#include <memory>
#include <stdint.h>
#include <stdint.h>
#include <vector>
#include <vector>
namespace
op
::
rearrange
::
cud
a
{
namespace
op
::
rearrange
::
nvidi
a
{
struct
Descriptor
::
Opaque
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
};
Descriptor
::~
Descriptor
()
{
Descriptor
::~
Descriptor
()
{
...
@@ -49,7 +49,7 @@ infiniStatus_t Descriptor::create(
...
@@ -49,7 +49,7 @@ infiniStatus_t Descriptor::create(
*
desc_ptr
=
new
Descriptor
(
*
desc_ptr
=
new
Descriptor
(
std
::
move
(
*
meta
),
std
::
move
(
*
meta
),
new
Opaque
{
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle
)
->
internal
()},
new
Opaque
{
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle
)
->
internal
()},
handle
->
device
,
handle
->
device_id
);
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
...
@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
block_len
.
push_back
(
split_dims
[
j
].
num_per_block
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
src_block_stride
.
push_back
(
dims
[
i
].
src_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
dst_block_stride
.
push_back
(
dims
[
i
].
dst_stride
);
split_dims
[
j
].
array_struct_idx_block
=
block_dim
;
split_dims
[
j
].
array_struct_idx_block
=
static_cast
<
int
>
(
block_dim
)
;
block_dim
+=
1
;
block_dim
+=
1
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
block_len_total
*=
split_dims
[
j
].
num_per_block
;
}
}
...
@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
...
@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
grid_len
.
push_back
(
split_dims
[
j
].
num_per_grid
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
src_grid_stride
.
push_back
(
dims
[
i
].
src_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
dst_grid_stride
.
push_back
(
dims
[
i
].
dst_stride
*
split_dims
[
j
].
num_per_block
);
split_dims
[
j
].
array_struct_idx_grid
=
grid_len
.
size
()
-
1
;
split_dims
[
j
].
array_struct_idx_grid
=
static_cast
<
int
>
(
grid_len
.
size
()
-
1
)
;
}
}
}
}
...
@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
...
@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
CHECK_OR_RETURN
(
cudaLaunchKernel
(
CHECK_OR_RETURN
(
cudaLaunchKernel
(
kernel_func
,
kernel_func
,
grid_size
,
BLOCK_SIZE
,
static_cast
<
unsigned
int
>
(
grid_size
)
,
static_cast
<
unsigned
int
>
(
BLOCK_SIZE
)
,
args
,
0
,
stream
)
args
,
0
,
stream
)
==
cudaSuccess
,
==
cudaSuccess
,
INFINI_STATUS_INTERNAL_ERROR
);
INFINI_STATUS_INTERNAL_ERROR
);
...
@@ -482,4 +482,4 @@ infiniStatus_t Descriptor::calculate(
...
@@ -482,4 +482,4 @@ infiniStatus_t Descriptor::calculate(
return
status
;
return
status
;
}
}
}
// namespace op::rearrange::
cud
a
}
// namespace op::rearrange::
nvidi
a
src/infiniop/ops/rearrange/
cud
a/rearrange_
cud
a.cuh
→
src/infiniop/ops/rearrange/
nvidi
a/rearrange_
nvidi
a.cuh
View file @
0166515c
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../rearrange.h"
#include "../rearrange.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif // __REARRANGE_CUDA_H__
#endif // __REARRANGE_CUDA_H__
src/infiniop/ops/rearrange/operator.cc
View file @
0166515c
...
@@ -8,12 +8,11 @@
...
@@ -8,12 +8,11 @@
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#include "ascend/rearrange_ascend.h"
#endif
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#ifdef ENABLE_CUDA_API
#include "nvidia/rearrange_nvidia.cuh"
#include "cuda/rearrange_cuda.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "m
aca
/rearrange_m
aca
.h"
#include "m
etax
/rearrange_m
etax
.h"
#endif
#endif
__C
infiniStatus_t
infiniopCreateRearrangeDescriptor
(
__C
infiniStatus_t
infiniopCreateRearrangeDescriptor
(
...
@@ -39,11 +38,14 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
...
@@ -39,11 +38,14 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -72,11 +74,14 @@ __C infiniStatus_t infiniopRearrange(
...
@@ -72,11 +74,14 @@ __C infiniStatus_t infiniopRearrange(
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#endif
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
default:
default:
...
@@ -102,12 +107,14 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
...
@@ -102,12 +107,14 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
default:
default:
...
...
src/infiniop/ops/relu/cpu/relu_cpu.cc
0 → 100644
View file @
0166515c
#include "relu_cpu.h"
namespace op::relu::cpu {

Descriptor::~Descriptor() = default;

/// Create a CPU ReLU descriptor.
///
/// Validates that the output dtype is one of the supported floating-point
/// types and that the single input tensor has exactly the same shape as the
/// output, then builds the shared CPU elementwise descriptor state.
///
/// @param handle_        opaque infiniop handle; cast to device::cpu::Handle
/// @param desc_ptr       receives the new descriptor (written inside
///                       CREATE_ELEMENTWISE_CPU_DESCRIPTOR)
/// @param out_desc       descriptor of the output tensor y
/// @param input_desc_vec input tensor descriptors; element 0 is x
/// @return INFINI_STATUS_SUCCESS on success; the CHECK_* macros return an
///         error status on validation failure
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // ReLU is only implemented for these floating-point element types.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    // Elementwise op: input and output shapes must match exactly.
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor (also populates *desc_ptr)
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

/// Run ReLU on the CPU, dispatching on the dtype captured at create time.
///
/// workspace, workspace_size and stream are part of the common operator
/// signature but are not referenced by this CPU path.
///
/// @param output  destination buffer for y
/// @param inputs  source buffers; element 0 is x
/// @return status from the elementwise framework, or
///         INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported dtype
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // No trailing return: every switch path (including default) returns.
}
} // namespace op::relu::cpu
src/infiniop/ops/relu/cpu/relu_cpu.h
0 → 100644
View file @
0166515c
#ifndef __RELU_CPU_H__
#define __RELU_CPU_H__
#include <algorithm>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
relu
,
cpu
)
namespace op::relu::cpu {

/// Functor implementing ReLU, y = max(x, 0), for the CPU elementwise
/// framework. Declared as a plain struct: the original C-style
/// `typedef struct ReluOp {...} ReluOp;` is redundant in C++, as is
/// `public:` inside a struct.
struct ReluOp {
    // Number of input tensors the elementwise framework feeds this op.
    static constexpr size_t num_inputs = 1;

    /// @return x when x is greater than zero, otherwise T(0).
    /// NOTE(review): assumes the literal 0 converts to T (true for
    /// float/double/int; presumably also for the project's fp16_t/bf16_t
    /// — confirm those types have a converting constructor from int).
    template <typename T>
    T operator()(const T &x) const {
        return std::max<T>(x, 0);
    }
};
} // namespace op::relu::cpu
#endif // __RELU_CPU_H__
src/infiniop/ops/relu/metax/relu_metax.h
0 → 100644
View file @
0166515c
#ifndef __RELU_METAX_API_H__
#define __RELU_METAX_API_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
relu
,
metax
)
#endif
#endif // __RELU_METAX_API_H__
src/infiniop/ops/relu/metax/relu_metax.maca
0 → 100644
View file @
0166515c
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/metax/metax_common.h"
#include "relu_metax.h"
namespace op::relu::metax {

Descriptor::~Descriptor() = default;

/// Create a Metax ReLU descriptor.
///
/// Validates the output dtype and that input 0 matches the output shape,
/// then builds the shared Metax elementwise descriptor state.
///
/// @param handle_        opaque infiniop handle; cast to device::metax::Handle
/// @param desc_ptr       receives the new descriptor (written inside
///                       CREATE_ELEMENTWISE_METAX_DESCRIPTOR)
/// @param out_desc       descriptor of the output tensor y
/// @param input_desc_vec input tensor descriptors; element 0 is x
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // ReLU is only implemented for these floating-point element types.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    // Elementwise op: input and output shapes must match exactly.
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create METAX elementwise descriptor (also populates *desc_ptr)
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

/// Run ReLU through the generated NineToothed kernel.
///
/// `workspace` itself is not referenced here; only `workspace_size` is
/// checked against the size recorded at create time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Stage the input's shape/strides in contiguous local buffers so the
    // launcher can consume raw pointers that stay alive for this call.
    const auto &ndim{_info.getNdim()};
    const auto &x_shape_{_info.getInputShape(0)};
    const auto &x_strides_{_info.getInputStrides(0)};
    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
    auto x_data{const_cast<void *>(inputs[0])};
    auto x_shape{x_shape_vec.data()};
    auto x_strides{x_strides_vec.data()};
    const NineToothedTensor x{x_data, x_shape, x_strides};

    // Same staging for the output tensor.
    const auto &y_shape_{_info.getOutputShape()};
    const auto &y_strides_{_info.getOutputStrides()};
    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
    auto y_data{output};
    auto y_shape{y_shape_vec.data()};
    auto y_strides{y_strides_vec.data()};
    const NineToothedTensor y{y_data, y_shape, y_strides};

    constexpr auto block_size{1024};

    switch (_dtype) {
    case INFINI_DTYPE_F16:
    case INFINI_DTYPE_F32:
    case INFINI_DTYPE_F64:
    case INFINI_DTYPE_BF16:
        // launch_relu signals failure with a nonzero return value;
        // the dtype is forwarded so the generated kernel can dispatch.
        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
            return INFINI_STATUS_INTERNAL_ERROR;
        }
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // No trailing return: every switch path (including default) returns.
}
} // namespace op::relu::metax
#endif
src/infiniop/ops/relu/ninetoothed/build.py
0 → 100644
View file @
0166515c
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment