Unverified Commit e698ef6b authored by gongchensu's avatar gongchensu Committed by GitHub
Browse files

issue/486 Adapt seven operators to Hygon machines.


Co-authored-by: default avatarzhuyue <zhuyue@qiyuanlab.com>
parent 3959c943
......@@ -19,7 +19,7 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
- 摩尔线程 GPU;
- 天数智芯 GPU;
- 沐曦 GPU;
- 光 DCU;
- 光 DCU;
- 华为昇腾 NPU;
- 寒武纪 MLU;
- 昆仑芯 XPU;
......@@ -50,7 +50,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
| `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n
| `--iluvatar-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--sugon-dcu=[y\|n]` | 是否编译光 DCU 接口实现 | n
| `--hygon-dcu=[y\|n]` | 是否编译光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
......
......@@ -45,7 +45,7 @@ typedef enum {
INFINI_DEVICE_MOORE = 5,
INFINI_DEVICE_ILUVATAR = 6,
INFINI_DEVICE_KUNLUN = 7,
INFINI_DEVICE_SUGON = 8,
INFINI_DEVICE_HYGON = 8,
INFINI_DEVICE_TYPE_COUNT
} infiniDevice_t;
......
......@@ -12,7 +12,7 @@ void printUsage() {
std::cout << "infiniccl-test --<device>" << std::endl
<< std::endl;
std::cout << " --<device>" << std::endl;
std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon)." << std::endl
std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon)." << std::endl
<< std::endl;
std::cout << "The program will run tests on all visible devices of the specified device type."
<< " Use Environmental Variables such as CUDA_VSIBLE_DEVICES to limit visible device IDs.";
......@@ -44,7 +44,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
else PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
else PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
else PARSE_DEVICE("--sugon", INFINI_DEVICE_SUGON)
else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
else {
printUsage();
}
......
......@@ -4,7 +4,7 @@
#include "../infiniccl_impl.h"
// Windows does not support CUDA
#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
......
......@@ -20,6 +20,7 @@ __C infiniStatus_t infinicclCommInitAll(
switch (device_type) {
COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda);
COMM_INIT_ALL(INFINI_DEVICE_ILUVATAR, cuda);
COMM_INIT_ALL(INFINI_DEVICE_HYGON, cuda);
COMM_INIT_ALL(INFINI_DEVICE_ASCEND, ascend);
COMM_INIT_ALL(INFINI_DEVICE_CAMBRICON, cambricon);
COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
......@@ -44,6 +45,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
switch (comm->device_type) {
COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda);
COMM_DESTROY(INFINI_DEVICE_ILUVATAR, cuda);
COMM_DESTROY(INFINI_DEVICE_HYGON, cuda);
COMM_DESTROY(INFINI_DEVICE_ASCEND, ascend);
COMM_DESTROY(INFINI_DEVICE_CAMBRICON, cambricon);
COMM_DESTROY(INFINI_DEVICE_METAX, metax);
......@@ -75,6 +77,7 @@ __C infiniStatus_t infinicclAllReduce(
switch (comm->device_type) {
ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda);
ALL_REDUCE(INFINI_DEVICE_ILUVATAR, cuda);
ALL_REDUCE(INFINI_DEVICE_HYGON, cuda);
ALL_REDUCE(INFINI_DEVICE_ASCEND, ascend);
ALL_REDUCE(INFINI_DEVICE_CAMBRICON, cambricon);
ALL_REDUCE(INFINI_DEVICE_METAX, metax);
......
......@@ -22,7 +22,7 @@ void printUsage() {
std::cout << " Path to the test gguf file" << std::endl
<< std::endl;
std::cout << " --<device>[:id]" << std::endl;
std::cout << " (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon) and device ID (optional). CPU by default." << std::endl
std::cout << " (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|hygon) and device ID (optional). CPU by default." << std::endl
<< std::endl;
std::cout << " --warmup <warmups>" << std::endl;
std::cout << " (Optional) Number of warmups to perform before timing. Default to 0." << std::endl
......@@ -78,7 +78,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
PARSE_DEVICE("--sugon", INFINI_DEVICE_SUGON)
PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
else if (arg == "--warmup" && i + 1 < argc) {
args.warmups = std::stoi(argv[++i]);
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/cpu_handle.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/nvidia_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
......@@ -62,6 +62,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, hygon);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -101,6 +104,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, hygon);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -104,4 +104,16 @@ infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
} // namespace iluvatar
namespace hygon {
Handle::Handle(int device_id)
: nvidia::Handle(INFINI_DEVICE_HYGON, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace hygon
} // namespace device
......@@ -35,6 +35,17 @@ public:
} // namespace iluvatar
namespace hygon {
struct Handle : public nvidia::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace hygon
} // namespace device
#endif // __INFINIOP_CUDA_HANDLE_H__
#ifdef ENABLE_SUGON_CUDA_API
#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
#ifndef __INFINIOP_CUDA_KERNEL_COMMON_CUH__
#define __INFINIOP_CUDA_KERNEL_COMMON_CUH__
#if defined(ENABLE_HYGON_API)
#define INFINIOP_CUDA_KERNEL __launch_bounds__(1024) __global__ void
#else
#define INFINIOP_CUDA_KERNEL __global__ void
#endif
......@@ -15,8 +18,14 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
#ifdef ENABLE_HYGON_API
// Hygon DCU uses different bfloat16 type definitions
using cuda_bfloat16 = __nv_bfloat16;
using cuda_bfloat162 = __nv_bfloat162;
#else
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
#endif
namespace device::nvidia {
......@@ -41,7 +50,7 @@ exp_(const float val) {
return expf(val);
}
#ifndef ENABLE_ILUVATAR_API
#if !defined(ENABLE_ILUVATAR_API) && !defined(ENABLE_HYGON_API)
__forceinline__ __device__ long double
exp_(const long double val) {
return expl(val);
......@@ -62,3 +71,5 @@ __forceinline__ __device__ __nv_bfloat16
exp_(const __nv_bfloat16 x) {
return hexp(x);
}
#endif // __INFINIOP_CUDA_KERNEL_COMMON_CUH__
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/causal_softmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/causal_softmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang)
#endif
......@@ -84,6 +87,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
......@@ -125,6 +131,9 @@ __C infiniStatus_t infiniopCausalSoftmax(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang)
#endif
......@@ -161,6 +170,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
DESTROY(INFINI_DEVICE_CAMBRICON, bang)
#endif
......
......@@ -43,7 +43,7 @@ infiniStatus_t Descriptor::calculate(
void *stream) const {
cudaDataType a_type, b_type, c_type;
#ifdef ENABLE_ILUVATAR_API
#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
cudaDataType compute_type;
#else
cublasComputeType_t compute_type;
......@@ -52,7 +52,7 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
a_type = b_type = c_type = CUDA_R_16F;
#ifdef ENABLE_ILUVATAR_API
#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
compute_type = CUDA_R_32F;
#else
compute_type = CUBLAS_COMPUTE_32F;
......@@ -60,7 +60,7 @@ infiniStatus_t Descriptor::calculate(
break;
case INFINI_DTYPE_BF16:
a_type = b_type = c_type = CUDA_R_16BF;
#ifdef ENABLE_ILUVATAR_API
#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
compute_type = CUDA_R_32F;
#else
compute_type = CUBLAS_COMPUTE_32F;
......@@ -68,10 +68,8 @@ infiniStatus_t Descriptor::calculate(
break;
case INFINI_DTYPE_F32:
a_type = b_type = c_type = CUDA_R_32F;
#if defined ENABLE_ILUVATAR_API
#if defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
compute_type = CUDA_R_32F;
#elif defined ENABLE_SUGON_CUDA_API
compute_type = CUBLAS_COMPUTE_32F;
#else
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/gemm_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/gemm_nvidia.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
......@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -96,6 +99,9 @@ infiniopGetGemmWorkspaceSize(
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -148,6 +154,9 @@ __C infiniStatus_t infiniopGemm(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -190,6 +199,9 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/random_sample_nvidia.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
......@@ -50,6 +50,9 @@ infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -95,6 +98,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -150,6 +156,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -192,6 +201,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......
......@@ -2,6 +2,7 @@
#define __REARRANGE_CUDA_KERNEL_H__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t
......@@ -30,7 +31,7 @@ struct Constraint {
// 定义宏生成内核函数
#define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \
extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \
extern "C" INFINIOP_CUDA_KERNEL rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \
void *__restrict__ dst, \
const void *__restrict__ src, \
const size_t block_dim, \
......
......@@ -8,7 +8,7 @@
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rearrange_nvidia.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
......@@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -96,6 +99,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......@@ -138,6 +144,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rms_norm_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
......@@ -52,6 +52,9 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -91,6 +94,9 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -131,6 +137,9 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -170,6 +179,9 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/rope_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/rope_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
......@@ -56,6 +56,9 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
......@@ -95,6 +98,9 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
......@@ -143,6 +149,9 @@ __C infiniStatus_t infiniopRoPE(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
......@@ -183,6 +192,9 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
#include "nvidia/swiglu_nvidia.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
......@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -91,6 +94,9 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -138,6 +144,9 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......@@ -180,6 +189,9 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
......
......@@ -52,13 +52,22 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
Tdata max_ = data_ptr[0];
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
#ifdef ENABLE_HYGON_API
max_ = (data_ptr[i] > max_) ? data_ptr[i] : max_;
#else
max_ = cub::Max()(max_, data_ptr[i]);
#endif
}
using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
#ifdef ENABLE_HYGON_API
return BlockReduce(temp_storage).Reduce(
max_, [](const Tdata &a, const Tdata &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
#endif
}
} // namespace op::common_cuda::reduce_op
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment