Unverified Commit 0166515c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "sub_metax.h"
namespace op::sub::metax {

Descriptor::~Descriptor() = default;

// Creates a Sub descriptor for the METAX backend.
// Validates that the output dtype is one of F16/F32/F64/BF16 and that both
// input shapes match the output shape, then builds the shared elementwise
// descriptor. The CHECK_* / CREATE_* macros return early on failure.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Sub takes exactly two inputs: a (minuend) and b (subtrahend).
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create METAX elementwise descriptor (populates *desc_ptr)
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Launches the elementwise subtraction kernel (block size 256), dispatching on
// the dtype captured at create() time. Fails fast if the caller-provided
// workspace is smaller than the size reported at descriptor creation.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::SubOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every switch path above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::sub::metax
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
namespace op::mul::cuda {
#include "../cuda/kernel.cuh"
#include "sub_nvidia.cuh"
namespace op::sub::nvidia {
Descriptor::~Descriptor() = default;
......@@ -11,7 +13,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
......@@ -20,7 +22,7 @@ infiniStatus_t Descriptor::create(
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
......@@ -43,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SubOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
} // namespace op::sub::nvidia
#ifndef __SUB_CUDA_API_H__
#define __SUB_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(sub, cuda)
ELEMENTWISE_DESCRIPTOR(sub, nvidia)
#endif // __SUB_CUDA_API_H__
......@@ -5,8 +5,11 @@
#ifdef ENABLE_CPU_API
#include "cpu/sub_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/sub_cuda.cuh"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sub_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/sub_metax.h"
#endif
__C infiniStatus_t infiniopCreateSubDescriptor(
......@@ -30,8 +33,14 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default:
......@@ -46,14 +55,20 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::sub::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -82,8 +97,14 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default:
......@@ -106,8 +127,14 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default:
......
#ifndef __SWIGLU_CUDA_H__
#define __SWIGLU_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::swiglu::cuda {
typedef struct SwiGLUOp {
private:
......@@ -14,13 +10,13 @@ private:
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, __nv_bfloat162>) {
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
float x0 = __bfloat162float(__low2bfloat16(x));
float x1 = __bfloat162float(__high2bfloat16(x));
float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
return __floats2bfloat162_rn(sig0, sig1);
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float xf = __bfloat162float(x);
return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
} else if constexpr (std::is_same_v<T, float>) {
......@@ -38,8 +34,8 @@ public:
return __hmul2(__hmul2(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, __nv_bfloat162>) {
__nv_bfloat162 sig = sigmoid(gate);
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
cuda_bfloat162 sig = sigmoid(gate);
float gate0 = __bfloat162float(__low2bfloat16(gate));
float gate1 = __bfloat162float(__high2bfloat16(gate));
float sig0 = __bfloat162float(__low2bfloat16(sig));
......@@ -49,8 +45,8 @@ public:
float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
return __floats2bfloat162_rn(res0, res1);
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
__nv_bfloat16 sig = sigmoid(gate);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
cuda_bfloat16 sig = sigmoid(gate);
float gatef = __bfloat162float(gate);
float sigf = __bfloat162float(sig);
float upf = __bfloat162float(up);
......
#ifndef __SWIGLU_MACA_API_H__
#define __SWIGLU_MACA_API_H__

#include "../../../elementwise/maca/elementwise_maca_api.h"

// Declares the op::swiglu::maca Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(swiglu, maca)

#endif // __SWIGLU_MACA_API_H__
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__

#include "../../../elementwise/maca/elementwise_maca.h"

#include <hctlass/half.h>

namespace op::swiglu::maca {

// Elementwise functor computing SwiGLU(up, gate) = gate * sigmoid(gate) * up
// on the MACA backend, with fast paths for half2 / half / float; any other
// type (e.g. double) falls back to plain C++ arithmetic.
typedef struct SwiGLUOp {
private:
    // Logistic sigmoid 1 / (1 + exp(-x)) using device intrinsics where available.
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2: both lanes computed at once.
            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
        } else if constexpr (std::is_same_v<T, half>) {
            // Scalar half: exponent evaluated in float, then converted back.
            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
        } else if constexpr (std::is_same_v<T, float>) {
            // Fast float path: __expf is the reduced-precision exponential.
            return __frcp_rn(__fadd_rn(1, __expf(-x)));
        } else {
            // Generic fallback (e.g. double).
            return 1 / (1 + std::exp(-x));
        }
    }

public:
    // Number of input tensors the elementwise framework passes to operator().
    static constexpr size_t num_inputs = 2;

    // SwiGLU activation: gate * sigmoid(gate) * up.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            return gate * sigmoid(gate) * up;
        }
    }
} SwiGLUOp;
} // namespace op::swiglu::maca

#endif
#ifndef __SWIGLU_METAX_API_H__
#define __SWIGLU_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares the op::swiglu::metax Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(swiglu, metax)

#endif // __SWIGLU_METAX_API_H__
#include "swiglu_maca.h"
#include "swiglu_maca_internal.h"
#include "swiglu_metax.h"
namespace op::swiglu::maca {
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::swiglu::metax {
Descriptor::~Descriptor() = default;
......@@ -11,7 +14,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
......@@ -20,11 +23,11 @@ infiniStatus_t Descriptor::create(
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create MACA elementwise descriptor
CREATE_ELEMENTWISE_MACA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
// create METAX elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
......@@ -42,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::maca
} // namespace op::swiglu::metax
#include "swiglu_cuda.cuh"
#include "swiglu_cuda_internal.cuh"
#include "swiglu_nvidia.cuh"
namespace op::swiglu::cuda {
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
namespace op::swiglu::nvidia {
Descriptor::~Descriptor() = default;
......@@ -11,7 +14,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
......@@ -42,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cuda
} // namespace op::swiglu::nvidia
#ifndef __SWIGLU_CUDA_API_H__
#define __SWIGLU_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(swiglu, cuda)
ELEMENTWISE_DESCRIPTOR(swiglu, nvidia)
#endif // __SWIGLU_CUDA_API_H__
......@@ -5,14 +5,14 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/swiglu_nvidia.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#include "metax/swiglu_metax.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
......@@ -39,14 +39,17 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -83,20 +86,23 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -104,12 +110,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
}
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
......@@ -142,14 +143,17 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -188,14 +192,17 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on CUDA.
*
 * Note: Only the local result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*
* Important Note: This is a device-independent header file containing reduce kernels
* for all cuda-supporting platforms. Include device-specific headers
* (such as <cub/block/block_reduce.cuh> for nvidia) in your source file
* and then include this file for proper usage.
*/
namespace op::common_cuda::reduce_op {
......
#ifndef __INFINIOP_REDUCE_MACA_H__
#define __INFINIOP_REDUCE_MACA_H__

#include <hccub/block/block_reduce.cuh>

/*
 * Device functions for reduction operations on MACA.
 *
 * Note: Only the local result on thread 0 is guaranteed to be correct.
 * A manual broadcast is needed for other threads.
 *
 * All functions are block-cooperative: every thread of the BLOCK_SIZE-thread
 * block must call them (they construct shared CUB temp storage internally).
 */
namespace op::common_maca::reduce_op {

// Sum(x^2) on contiguous data of length count.
// Result is valid on thread 0 only.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    Tcompute ss = 0;

    // Each thread computes its partial sum (block-stride loop over the range).
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]);
    }

    // Use CUB block-level reduction (hccub provides the cub:: namespace here).
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    return BlockReduce(temp_storage).Sum(ss);
}

// Sum(x) on contiguous data of length count.
// Result is valid on thread 0 only.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
    Tcompute s = 0;
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        s += Tcompute(data_ptr[i]);
    }
    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    return BlockReduce(temp_storage).Sum(s);
}

// Max(x) on contiguous data of length count.
// Result is valid on thread 0 only.
// NOTE(review): every thread seeds with data_ptr[0], so this assumes
// count >= 1 (count == 0 would read out of bounds) — confirm with callers.
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
    // Seeding with the first element is safe for max: it is included anyway,
    // and threads whose stride range is empty contribute a real value.
    Tdata max_ = data_ptr[0];
    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
        max_ = cub::Max()(max_, data_ptr[i]);
    }
    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    // Last argument is the number of valid threads (the full block here).
    return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
}
} // namespace op::common_maca::reduce_op

#endif
#include "test.h"
#include <infinirt.h>
// Command-line options for the infinirt test driver.
struct ParsedArgs {
    infiniDevice_t device_type = INFINI_DEVICE_CPU; // device under test; defaults to CPU
};
// Prints the CLI help text and terminates the process with a failure code.
void printUsage() {
    static const char *const kUsageLines[] = {
        "Usage:",
        " infinirt-test [--<device>]",
        "",
        "Options:",
        " --<device> Specify the device type.",
        "",
        "Available devices:",
        " cpu - Default",
        " nvidia",
        " cambricon",
        " ascend",
        " metax",
        " moore",
        " iluvatar",
        " kunlun",
        " sugon",
        "",
    };
    for (const char *line : kUsageLines) {
        std::cout << line << std::endl;
    }
    exit(EXIT_FAILURE);
}
// Parses the single optional "--<device>" flag from argv.
// With no argument the default (CPU) is kept; "--help"/"-h" or an unknown
// flag prints usage and exits.
ParsedArgs parseArgs(int argc, char *argv[]) {
    ParsedArgs args;
    if (argc < 2) {
        return args; // no flag: default to CPU
    }

    std::string arg = argv[1];
    if (arg == "--help" || arg == "-h") {
        printUsage();
    }

    try {
        // Flag -> device lookup table (replaces the macro-generated else-chain).
        struct FlagEntry {
            const char *flag;
            infiniDevice_t device;
        };
        static const FlagEntry kFlags[] = {
            {"--cpu", INFINI_DEVICE_CPU},
            {"--nvidia", INFINI_DEVICE_NVIDIA},
            {"--cambricon", INFINI_DEVICE_CAMBRICON},
            {"--ascend", INFINI_DEVICE_ASCEND},
            {"--metax", INFINI_DEVICE_METAX},
            {"--moore", INFINI_DEVICE_MOORE},
            {"--iluvatar", INFINI_DEVICE_ILUVATAR},
            {"--kunlun", INFINI_DEVICE_KUNLUN},
            {"--sugon", INFINI_DEVICE_SUGON},
        };

        bool matched = false;
        for (const FlagEntry &entry : kFlags) {
            if (arg == entry.flag) {
                args.device_type = entry.device;
                matched = true;
                break;
            }
        }
        if (!matched) {
            printUsage();
        }
    } catch (const std::exception &) {
        printUsage();
    }
    return args;
}
int main(int argc, char *argv[]) {
ParsedArgs args = parseArgs(argc, argv);
std::cout << "Testing Device: " << args.device_type << std::endl;
infiniDevice_t device = args.device_type;
// 获取设备总数
std::vector<int> deviceCounts(INFINI_DEVICE_TYPE_COUNT, 0);
if (infinirtGetAllDeviceCount(deviceCounts.data()) != INFINI_STATUS_SUCCESS) {
std::cerr << "Failed to get total device count." << std::endl;
return 1;
}
int numDevices = deviceCounts[device];
std::cout << "Device Type: " << device << " | Available Devices: " << numDevices << std::endl;
if (numDevices == 0) {
std::cout << "Device type " << device << " has no available devices." << std::endl;
return 0;
}
for (int deviceId = 0; deviceId < numDevices; ++deviceId) {
if (!testSetDevice(device, deviceId)) {
return 1;
}
size_t dataSize[] = {1 << 10, 4 << 10, 2 << 20, 1L << 30};
for (size_t size : dataSize) {
if (!testMemcpy(device, deviceId, size)) {
return 1;
}
}
}
return 0;
}
#include "test.h"
#include <cstring>
#include <infinirt.h>
#include <iostream>
// Round-trips `dataSize` floats host -> device -> device -> host on the given
// device and verifies the bytes survive unchanged. Returns true on success;
// on any failure an error is printed and previously allocated device buffers
// are released.
bool testMemcpy(infiniDevice_t device, int deviceId, size_t dataSize) {
    std::cout << "==============================================\n"
              << "Testing memcpy on Device ID: " << deviceId << "\n"
              << "==============================================" << std::endl;

    // Host-side buffers: a known pattern and a zeroed destination.
    std::cout << "[Device " << deviceId << "] Allocating host memory: " << dataSize * sizeof(float) << " bytes" << std::endl;
    std::vector<float> hostData(dataSize, 1.23f);
    std::vector<float> hostCopy(dataSize, 0.0f);

    // Device-side source and destination buffers.
    void *deviceSrc = nullptr, *deviceDst = nullptr;
    const size_t dataSizeInBytes = dataSize * sizeof(float);
    std::cout << "[Device " << deviceId << "] Allocating device memory: " << dataSizeInBytes << " bytes" << std::endl;
    if (infinirtMalloc(&deviceSrc, dataSizeInBytes) != INFINI_STATUS_SUCCESS) {
        std::cerr << "[Device " << deviceId << "] Failed to allocate device memory for deviceSrc." << std::endl;
        return false;
    }
    if (infinirtMalloc(&deviceDst, dataSizeInBytes) != INFINI_STATUS_SUCCESS) {
        std::cerr << "[Device " << deviceId << "] Failed to allocate device memory for deviceDst." << std::endl;
        infinirtFree(deviceSrc);
        return false;
    }

    // Reports `reason`, releases both device buffers, and signals failure.
    auto fail = [&](const char *reason) {
        std::cerr << "[Device " << deviceId << "] " << reason << std::endl;
        infinirtFree(deviceSrc);
        infinirtFree(deviceDst);
        return false;
    };

    // Host -> device.
    std::cout << "[Device " << deviceId << "] Copying data from host to device..." << std::endl;
    if (infinirtMemcpy(deviceSrc, hostData.data(), dataSizeInBytes, INFINIRT_MEMCPY_H2D) != INFINI_STATUS_SUCCESS) {
        return fail("Failed to copy data from host to device.");
    }

    // Device -> device.
    std::cout << "[Device " << deviceId << "] Copying data between device memory (D2D)..." << std::endl;
    if (infinirtMemcpy(deviceDst, deviceSrc, dataSizeInBytes, INFINIRT_MEMCPY_D2D) != INFINI_STATUS_SUCCESS) {
        return fail("Failed to copy data from device to device.");
    }

    // Device -> host.
    std::cout << "[Device " << deviceId << "] Copying data from device back to host..." << std::endl;
    if (infinirtMemcpy(hostCopy.data(), deviceDst, dataSizeInBytes, INFINIRT_MEMCPY_D2H) != INFINI_STATUS_SUCCESS) {
        return fail("Failed to copy data from device to host.");
    }

    // Verify the round-tripped bytes.
    std::cout << "[Device " << deviceId << "] Validating copied data..." << std::endl;
    if (std::memcmp(hostData.data(), hostCopy.data(), dataSizeInBytes) != 0) {
        return fail("Data mismatch between hostData and hostCopy.");
    }
    std::cout << "[Device " << deviceId << "] Data copied correctly!" << std::endl;

    // Release device memory.
    std::cout << "[Device " << deviceId << "] Freeing device memory..." << std::endl;
    infinirtFree(deviceSrc);
    infinirtFree(deviceDst);

    std::cout << "[Device " << deviceId << "] Memory copy test PASSED!" << std::endl;
    return true;
}
// Binds the calling thread to the given device/ID pair.
// Returns true on success; prints an error and returns false otherwise.
bool testSetDevice(infiniDevice_t device, int deviceId) {
    std::cout << "Setting device " << device << " with ID: " << deviceId << std::endl;
    if (infinirtSetDevice(device, deviceId) != INFINI_STATUS_SUCCESS) {
        std::cerr << "Failed to set device " << device << " with ID " << deviceId << std::endl;
        return false;
    }
    return true;
}
#ifndef __INFINIRT_TEST_H__
#define __INFINIRT_TEST_H__

#include "../utils.h"

// Binds the calling thread to device `deviceId` of type `device`; returns true on success.
bool testSetDevice(infiniDevice_t device, int deviceId);

// Round-trips `dataSize` floats H2D -> D2D -> D2H and validates the result; returns true on success.
bool testMemcpy(infiniDevice_t device, int deviceId, size_t dataSize);

#endif
......@@ -38,7 +38,7 @@ infiniStatus_t streamSynchronize(infinirtStream_t stream) {
}
infiniStatus_t streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
#ifdef ENABLE_ILUVATAR_CUDA_API
#ifdef ENABLE_ILUVATAR_API
return INFINI_STATUS_NOT_IMPLEMENTED;
#else
CHECK_CUDART(cudaStreamWaitEvent((cudaStream_t)stream, (cudaEvent_t)event));
......
......@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace infinirt::cuda {
#ifdef ENABLE_CUDA_API
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
......
......@@ -5,7 +5,7 @@
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "metax/infinirt_metax.h"
#include "musa/infinirt_musa.h"
thread_local infiniDevice_t CURRENT_DEVICE_TYPE = INFINI_DEVICE_CPU;
......@@ -23,6 +23,10 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
return INFINI_STATUS_NULL_POINTER;
}
for (size_t i = 0; i < INFINI_DEVICE_TYPE_COUNT; i++) {
if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_SUGON) {
count_array[i] = 0;
continue;
}
auto status = infinirtGetDeviceCount(static_cast<infiniDevice_t>(i), &count_array[i]);
if (status != INFINI_STATUS_SUCCESS) {
return status;
......@@ -62,7 +66,7 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
_status = infinirt::ascend::API PARAMS; \
break; \
case INFINI_DEVICE_METAX: \
_status = infinirt::maca::API PARAMS; \
_status = infinirt::metax::API PARAMS; \
break; \
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
......@@ -70,6 +74,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
case INFINI_DEVICE_ILUVATAR: \
_status = infinirt::cuda::API PARAMS; \
break; \
default: \
_status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment