Unverified commit e4605f7c authored by PanZezhong1725, committed by GitHub
Browse files

Merge pull request #293 from YdrMaster/distinct-cuda

issue291 合并 cuda 代码
parents 5025ebed eac2b0ca
#include "gemm_maca.h"
#include "gemm_metax.h"
#include "../../../devices/maca/common_maca.h"
#include "../../../devices/maca/maca_handle.h"
namespace op::gemm::maca {
namespace op::gemm::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
......@@ -21,9 +21,7 @@ infiniStatus_t Descriptor::create(
auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
CHECK_RESULT(result);
......@@ -53,7 +51,10 @@ infiniStatus_t Descriptor::calculate(
a_type = b_type = c_type = HPCC_R_16F;
compute_type = HCBLAS_COMPUTE_32F;
break;
case INFINI_DTYPE_BF16:
a_type = b_type = c_type = HPCC_R_16BF;
compute_type = HCBLAS_COMPUTE_32F;
break;
case INFINI_DTYPE_F32:
a_type = b_type = c_type = HPCC_R_32F;
compute_type = HCBLAS_COMPUTE_32F_FAST_TF32;
......@@ -103,4 +104,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS;
}
} // namespace op::gemm::maca
} // namespace op::gemm::metax
......@@ -3,6 +3,6 @@
#include "../gemm.h"
DESCRIPTOR(maca)
DESCRIPTOR(metax)
#endif // __GEMM_MACA_H__
......@@ -15,7 +15,7 @@
#include "ascend/gemm_ascend.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/gemm_maca.h"
#include "metax/gemm_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "musa/gemm_musa.h"
......@@ -55,7 +55,7 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, musa);
......@@ -97,7 +97,7 @@ infiniopGetGemmWorkspaceSize(
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, musa);
......@@ -146,7 +146,7 @@ __C infiniStatus_t infiniopGemm(
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, musa);
......@@ -185,7 +185,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, musa);
......
......@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(mul, cpu)
ELEMENTWISE_DESCRIPTOR(mul, cpu, cpu)
namespace op::mul::cpu {
typedef struct MulOp {
......
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"
#include "../cuda/kernel.cuh"
#include "mul_nvidia.cuh"
namespace op::mul::cuda {
namespace op::mul::nvidia {
Descriptor::~Descriptor() = default;
......@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
} // namespace op::mul::nvidia
......@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(mul, cuda)
ELEMENTWISE_DESCRIPTOR(mul, nvidia, cuda)
#endif // __MUL_CUDA_API_H__
......@@ -7,7 +7,7 @@
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/mul_cuda.cuh"
#include "nvidia/mul_nvidia.cuh"
#endif
__C infiniStatus_t infiniopCreateMulDescriptor(
......@@ -32,7 +32,7 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
default:
......@@ -47,14 +47,14 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -84,7 +84,7 @@ __C infiniStatus_t infiniopMul(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
default:
......@@ -108,7 +108,7 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
default:
......
......@@ -4,7 +4,7 @@
#include <hccub/device/device_reduce.cuh>
#include <hccub/device/device_scan.cuh>
namespace op::random_sample::maca {
namespace op::random_sample::metax {
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
......@@ -256,4 +256,4 @@ struct Algo {
}
};
} // namespace op::random_sample::maca
} // namespace op::random_sample::metax
......@@ -3,6 +3,6 @@
#include "../random_sample.h"
DESCRIPTOR(maca)
DESCRIPTOR(metax)
#endif // __RANDOM_SAMPLE_MACA_H__
......@@ -2,9 +2,9 @@
#include "../../../devices/maca/maca_handle.h"
#include "../info.h"
#include "random_sample_kernel.h"
#include "random_sample_maca.h"
#include "random_sample_metax.h"
namespace op::random_sample::maca {
namespace op::random_sample::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
......
......@@ -9,7 +9,7 @@
#include "cuda/random_sample_cuda.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "maca/random_sample_maca.h"
#include "metax/random_sample_metax.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/random_sample_aclnn.h"
......@@ -39,7 +39,7 @@ infiniopCreateRandomSampleDescriptor(
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
......@@ -72,7 +72,7 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend);
......@@ -115,7 +115,7 @@ __C infiniStatus_t infiniopRandomSample(
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
......@@ -145,7 +145,7 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend);
......
......@@ -3,6 +3,6 @@
#include "../rearrange.h"
DESCRIPTOR(maca)
DESCRIPTOR(metax)
#endif // __REARRANGE_MACA_H__
#include "../../../tensor.h"
#include "rearrange_kernel.h"
#include "rearrange_maca.h"
#include "rearrange_metax.h"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>
namespace op::rearrange::maca {
namespace op::rearrange::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal;
......@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate(
return status;
}
} // namespace op::rearrange::maca
} // namespace op::rearrange::metax
......@@ -13,7 +13,7 @@
#include "cuda/rearrange_cuda.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "maca/rearrange_maca.h"
#include "metax/rearrange_metax.h"
#endif
__C infiniStatus_t infiniopCreateRearrangeDescriptor(
......@@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -76,7 +76,7 @@ __C infiniStatus_t infiniopRearrange(
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default:
......@@ -107,7 +107,7 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default:
......
......@@ -5,7 +5,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(relu, cpu)
ELEMENTWISE_DESCRIPTOR(relu, cpu, cpu)
namespace op::relu::cpu {
typedef struct ReluOp {
......
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
INFINIOP_CUDA_KERNEL rmsnormBlock(
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
__device__ void rmsnormBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y,
const Tdata *__restrict__ x,
......
#include "../../../devices/maca/common_maca.h"
#include "../cuda/rms_norm_kernel.cuh"
#include "rms_norm_maca.cuh"
#include "rms_norm_metax.cuh"
#include "../../../devices/maca/maca_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// Thin launch wrapper for RMSNorm on the MACA/Metax backend: forwards all
// arguments to the shared device-side implementation rmsnormBlock
// (presumably provided by the included ../cuda/kernel.cuh — confirm).
// NOTE(review): INFINIOP_MACA_KERNEL is assumed to expand to the MACA
// equivalent of `__global__` — verify in maca_kernel_common.h.
// The caller launches this with <<<batch_size, BLOCK_SIZE>>> elsewhere in
// this change, so each block appears to process one row of `dim` elements
// with row strides stride_x / stride_y — TODO confirm against rmsnormBlock.
// Tcompute is the accumulation type (e.g. float for half data).
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_MACA_KERNEL rmsnormKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y,
const Tdata *__restrict__ x,
ptrdiff_t stride_x,
const Tweight *__restrict__ w,
size_t dim,
float epsilon) {
rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
}
namespace op::rms_norm::maca {
......@@ -46,14 +64,14 @@ infiniStatus_t launchKernel(
float epsilon,
hcStream_t maca_stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
......@@ -91,8 +109,8 @@ infiniStatus_t Descriptor::calculate(
auto maca_stream = reinterpret_cast<hcStream_t>(stream);
// launch kernel with different block sizes
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
if (_opaque->internal->maxThreadsPerBlock() == MACA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MACA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment