Unverified Commit 6ec2ea40 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #865 from gongchensu/Issue/862

Issue/862 - Fix compilation errors (missing headers, cub namespace) t…
parents 012df56c 8d09630a
......@@ -9,7 +9,9 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#ifndef ENABLE_HYGON_API
#include <cuda_fp8.h>
#endif
// Posible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/add_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/add_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -54,6 +54,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -97,6 +100,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
......@@ -148,6 +154,9 @@ __C infiniStatus_t infiniopAdd(
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
......@@ -193,6 +202,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
......
......@@ -56,6 +56,9 @@ __device__ void logSoftmaxKernel(
}
#if CUDART_VERSION >= 12090
max_val = BlockReduce(temp_storage).Reduce(max_val, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
max_val = BlockReduce(temp_storage).Reduce(
max_val, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
max_val = BlockReduce(temp_storage).Reduce(max_val, cub::Max());
#endif
......
......@@ -19,6 +19,9 @@ __device__ void blockLPNormKernel(
__shared__ float global_max;
#if CUDART_VERSION >= 12090
float max_block = BlockReduce(temp_storage).Reduce(local_max, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
float max_block = BlockReduce(temp_storage).Reduce(
local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
float max_block = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
#endif
......@@ -75,6 +78,9 @@ __device__ void blockLPNormStridesKernel(
__shared__ float global_max;
#if CUDART_VERSION >= 12090
float max_block = BlockReduce(temp_storage).Reduce(local_max, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
float max_block = BlockReduce(temp_storage).Reduce(
local_max, [](const float &a, const float &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
float max_block = BlockReduce(temp_storage).Reduce(local_max, cub::Max());
#endif
......
......@@ -27,8 +27,10 @@ public:
return 1;
} else if constexpr (std::is_same_v<T, uint64_t>) { // 10
return 1;
#ifndef ENABLE_HYGON_API
} else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // 11
return cuda_fp8_e4m3(1.0f);
#endif
} else if constexpr (std::is_same_v<T, half>) { // 12
return __float2half(1.0f);
} else if constexpr (std::is_same_v<T, float>) { // 13
......
......@@ -79,8 +79,10 @@ infiniStatus_t Descriptor::calculate(
return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_U64: // 10
return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
#ifndef ENABLE_HYGON_API
case INFINI_DTYPE_F8: // 11
return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
#endif
case INFINI_DTYPE_F16: // 12
return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32: // 13
......
......@@ -27,6 +27,15 @@ struct CustomLess {
}
};
// Warp-level sum reduction for Hygon platform
template <int warp_threads>
__inline__ __device__ float WarpSum(float val) {
for (int mask = warp_threads / 2; mask > 0; mask /= 2) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val;
}
template <typename T, int BLOCK_THREADS = 256>
__global__ void topkrouter_kernel(float *values_topk, // 输出数据, 形状[N, topk]
int *indices_topk, // 输出索引, 形状[N, topk]
......@@ -137,12 +146,19 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出数
value = sigmoid_func(data_input[index]);
}
{
#ifdef ENABLE_HYGON_API
float warp_sum = WarpSum<warp_threads>(value);
if (0 == tid) {
share_sum = warp_sum + 1e-9f;
}
#else
typedef cub::WarpReduce<float, warp_threads> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
float warp_sum = WarpReduce(temp_storage).Sum(value);
if (0 == tid) {
share_sum = warp_sum + 1e-9f;
}
#endif
}
__syncwarp();
......
......@@ -19,6 +19,15 @@ inline __device__ float exp_func(T x) {
return __expf(data);
}
// Warp-level sum reduction for Hygon platform
template <int warp_threads>
__inline__ __device__ float WarpSum(float val) {
for (int mask = warp_threads / 2; mask > 0; mask /= 2) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val;
}
template <typename T, int BLOCK_SIZE = 128>
__global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形状[N, topk]
int *indices_topk, // 输出索引, 形状[N, topk]
......@@ -57,6 +66,9 @@ __global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形
__shared__ typename BlockReduce::TempStorage temp_storage_max;
#if CUDART_VERSION >= 12090
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, ::cuda::maximum());
#elif defined(ENABLE_HYGON_API)
T value_max = BlockReduce(temp_storage_max).Reduce(
thread_max, [](const T &a, const T &b) { return (a > b) ? a : b; }, BLOCK_SIZE);
#else
T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max());
#endif
......@@ -117,12 +129,19 @@ __global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形
// 第五步: topk的和 //
// ------------------------------------------------ //
{
#ifdef ENABLE_HYGON_API
float warp_sum = WarpSum<32>(value);
if (0 == tid) {
shared_sum = warp_sum + 1e-9f;
}
#else
typedef cub::WarpReduce<float, 32> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
float warp_sum = WarpReduce(temp_storage).Sum(value);
if (0 == tid) {
shared_sum = warp_sum + 1e-9f;
}
#endif
}
__syncwarp();
......
......@@ -27,8 +27,10 @@ public:
return 0;
} else if constexpr (std::is_same_v<T, uint64_t>) { // 10
return 0;
#ifndef ENABLE_HYGON_API
} else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // 11
return cuda_fp8_e4m3(0.0f);
#endif
} else if constexpr (std::is_same_v<T, half>) { // 12
return __float2half(0.0f);
} else if constexpr (std::is_same_v<T, float>) { // 13
......
......@@ -79,8 +79,10 @@ infiniStatus_t Descriptor::calculate(
return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_U64: // 10
return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
#ifndef ENABLE_HYGON_API
case INFINI_DTYPE_F8: // 11
return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
#endif
case INFINI_DTYPE_F16: // 12
return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32: // 13
......
......@@ -63,18 +63,13 @@ target("infiniop-hygon")
add_cxxflags("-fPIC")
-- 添加海光DCU特定的编译标志
add_cuflags("-arch=gfx906", "-arch=gfx926", "-arch=gfx928", "-arch=gfx936")
-- 检测实际GPU架构,如果未指定则默认使用gfx906
local hygon_arch = os.getenv("HYGON_ARCH") or "gfx906"
add_cuflags("-arch=" .. hygon_arch)
print("编译海光DCU架构: " .. hygon_arch)
-- 复用NVIDIA的CUDA实现,通过HIP兼容层
-- 只编译海光DCU支持的7个算子:rope, gemm, causal_softmax, random_sample, rearrange, rms_norm, swiglu
add_files("../src/infiniop/devices/nvidia/*.cu")
add_files("../src/infiniop/ops/rope/nvidia/*.cu")
add_files("../src/infiniop/ops/gemm/nvidia/*.cu")
add_files("../src/infiniop/ops/causal_softmax/nvidia/*.cu")
add_files("../src/infiniop/ops/random_sample/nvidia/*.cu")
add_files("../src/infiniop/ops/rearrange/nvidia/*.cu")
add_files("../src/infiniop/ops/rms_norm/nvidia/*.cu")
add_files("../src/infiniop/ops/swiglu/nvidia/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp", {cxxflags = {"-Wno-return-type"}})
......@@ -109,7 +104,9 @@ target("infinirt-hygon")
add_cxxflags("-fPIC")
-- 添加海光DCU特定的编译标志
add_cuflags("-arch=gfx906", "-arch=gfx926", "-arch=gfx928", "-arch=gfx936")
-- 检测实际GPU架构,如果未指定则默认使用gfx906
local hygon_arch = os.getenv("HYGON_ARCH") or "gfx906"
add_cuflags("-arch=" .. hygon_arch)
add_files("../src/infinirt/cuda/*.cu")
target_end()
......@@ -143,7 +140,9 @@ target("infiniccl-hygon")
add_cxxflags("-fPIC")
-- 添加海光DCU特定的编译标志
add_cuflags("-arch=gfx906", "-arch=gfx926", "-arch=gfx928", "-arch=gfx936")
-- 检测实际GPU架构,如果未指定则默认使用gfx906
local hygon_arch = os.getenv("HYGON_ARCH") or "gfx906"
add_cuflags("-arch=" .. hygon_arch)
-- 使用NCCL (NVIDIA Collective Communications Library)
add_links("nccl")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment