Unverified Commit e4605f7c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #293 from YdrMaster/distinct-cuda

issue291 合并 cuda 代码
parents 5025ebed eac2b0ca
#include "swiglu_cuda.cuh"
#include "swiglu_cuda_internal.cuh"
#include "swiglu_nvidia.cuh"
namespace op::swiglu::cuda {
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
namespace op::swiglu::nvidia {
Descriptor::~Descriptor() = default;
......@@ -42,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cuda
} // namespace op::swiglu::nvidia
......@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(swiglu, cuda)
ELEMENTWISE_DESCRIPTOR(swiglu, nvidia, cuda)
#endif // __SWIGLU_CUDA_API_H__
......@@ -6,13 +6,13 @@
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/swiglu_cuda.cuh"
#include "nvidia/swiglu_nvidia.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#include "metax/swiglu_metax.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
......@@ -40,13 +40,13 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -83,20 +83,20 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
}
GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
......@@ -143,13 +138,13 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -189,13 +184,13 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on CUDA.
*
* Note: Only the local result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*
* Important Note: This is a device-independent header file containing reduce kernels
* for all cuda-supporting platforms. Include device-specific headers
* (such as <cub/block/block_reduce.cuh> for nvidia) in your source file
* and then include this file for proper usage.
*/
namespace op::common_cuda::reduce_op {
......
#ifndef __INFINIOP_REDUCE_MACA_H__
#define __INFINIOP_REDUCE_MACA_H__
#include <hccub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on MACA.
*
* Note: Only the local result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*/
namespace op::common_maca::reduce_op {
// Block-level sum of squares over `count` contiguous elements starting at data_ptr.
//
// Template parameters:
//   BLOCK_SIZE - stride used by each thread when walking the data; also the
//                block dimension handed to cub::BlockReduce.
//   Tdata      - element type stored in memory.
//   Tcompute   - type each element is converted to before squaring/accumulating.
//
// Returns the value produced by cub::BlockReduce::Sum over the per-thread partials.
// As stated in this file's header note, only thread 0's return value is guaranteed
// to be correct.
//
// NOTE(review): every call of the same instantiation uses the same __shared__
// TempStorage object; callers invoking it repeatedly presumably need a block
// barrier in between -- confirm against the CUB/hccub documentation.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage reduce_storage;

    // Thread-local partial: visits indices threadIdx.x, threadIdx.x + BLOCK_SIZE, ...
    Tcompute partial = 0;
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        const Tcompute value = Tcompute(data_ptr[idx]);
        partial += value * value;
    }

    // Combine the per-thread partials across the block.
    return BlockReduce(reduce_storage).Sum(partial);
}
// Block-level sum over `count` contiguous elements starting at data_ptr.
//
// Each thread starts at index threadIdx.x and advances by BLOCK_SIZE, converting
// every visited element to Tcompute before accumulating. The per-thread partials
// are then combined with cub::BlockReduce::Sum.
//
// As stated in this file's header note, only thread 0's return value is guaranteed
// to be correct.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage reduce_storage;

    Tcompute partial = 0;
    size_t idx = threadIdx.x;
    while (idx < count) {
        partial += Tcompute(data_ptr[idx]);
        idx += BLOCK_SIZE;
    }

    return BlockReduce(reduce_storage).Sum(partial);
}
// Block-level maximum over `count` contiguous elements starting at data_ptr.
//
// Every thread seeds its running maximum with data_ptr[0] and then folds in the
// elements at indices threadIdx.x, threadIdx.x + BLOCK_SIZE, ... using cub::Max.
// The per-thread maxima are combined with cub::BlockReduce::Reduce, passing
// BLOCK_SIZE as the trailing argument.
//
// Precondition: data_ptr[0] is read unconditionally, so the caller must supply
// at least one readable element (count >= 1).
//
// As stated in this file's header note, only thread 0's return value is guaranteed
// to be correct.
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage reduce_storage;

    cub::Max max_op;

    // Seeding with element 0 keeps threads that visit no index at a valid value.
    Tdata local_max = data_ptr[0];
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        local_max = max_op(local_max, data_ptr[idx]);
    }

    return BlockReduce(reduce_storage).Reduce(local_max, max_op, BLOCK_SIZE);
}
} // namespace op::common_maca::reduce_op
#endif
......@@ -108,7 +108,7 @@ option_end()
if has_config("metax-gpu") then
add_defines("ENABLE_METAX_API")
includes("xmake/maca.lua")
includes("xmake/metax.lua")
end
-- 摩尔线程
......@@ -174,7 +174,7 @@ target("infini-utils")
add_cxflags("-fPIC", "-Wno-unknown-pragmas")
if has_config("omp") then
add_cxflags("-fopenmp")
add_ldflags("-fopenmp")
add_ldflags("-fopenmp", {force = true})
end
end
......
......@@ -46,7 +46,7 @@ target("infiniop-cuda")
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17")
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../build/ninetoothed/*.c")
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c")
target_end()
target("infinirt-cuda")
......
......@@ -34,8 +34,8 @@ target("infiniop-metax")
set_languages("cxx17")
set_warnings("all", "error")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/metax/*.cc")
add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
target_end()
target("infinirt-metax")
......@@ -61,5 +61,5 @@ target("infiniccl-metax")
add_files("../src/infiniccl/maca/*.cc")
end
set_languages("cxx17")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment