Unverified commit 0166515c authored by PanZezhong1725, committed by GitHub

Merge branch 'main' into issue/300

parents f0300ff3 a23c4d13
#include "../../../devices/maca/maca_kernel_common.h" #include "../../../devices/metax/metax_kernel_common.h"
#include "infinicore.h" #include "infinicore.h"
#include <hccub/device/device_radix_sort.cuh> #include <hccub/device/device_radix_sort.cuh>
#include <hccub/device/device_reduce.cuh> #include <hccub/device/device_reduce.cuh>
#include <hccub/device/device_scan.cuh> #include <hccub/device/device_scan.cuh>
namespace op::random_sample::maca { namespace op::random_sample::metax {
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用 // ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
...@@ -62,7 +62,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) { ...@@ -62,7 +62,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
const auto n = static_cast<int>(n_); const auto n = static_cast<int>(n_);
size_t argmax; size_t argmax;
CHECK_MACA(argMax_<Tval>( CHECK_METAX(argMax_<Tval>(
nullptr, nullptr, n, nullptr, nullptr, n,
nullptr, argmax, nullptr, argmax,
nullptr)); nullptr));
...@@ -77,7 +77,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) { ...@@ -77,7 +77,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
size_random += align256(sizeof(Tidx) * n); size_random += align256(sizeof(Tidx) * n);
// cub device api // cub device api
size_t size_radix_sort; size_t size_radix_sort;
CHECK_MACA((radixSort<Tval, Tidx>( CHECK_METAX((radixSort<Tval, Tidx>(
nullptr, size_radix_sort, nullptr, size_radix_sort,
nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
...@@ -85,7 +85,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) { ...@@ -85,7 +85,7 @@ utils::Result<size_t> calculateWorkspace(size_t n_) {
nullptr))); nullptr)));
size_t size_inclusive_sum; size_t size_inclusive_sum;
CHECK_MACA(inclusiveSum<Tval>( CHECK_METAX(inclusiveSum<Tval>(
nullptr, size_inclusive_sum, nullptr, size_inclusive_sum,
nullptr, n, nullptr, n,
nullptr)); nullptr));
...@@ -107,6 +107,11 @@ struct CudaTval<fp16_t> { ...@@ -107,6 +107,11 @@ struct CudaTval<fp16_t> {
using Type = half; using Type = half;
}; };
template <>
struct CudaTval<bf16_t> {
using Type = __hpcc_bfloat16;
};
// ↑↑↑ 通过特化将 fp16_t 转换为 half // ↑↑↑ 通过特化将 fp16_t 转换为 half
// ↓↓↓ 用于采样过程的小型 kernel // ↓↓↓ 用于采样过程的小型 kernel
...@@ -228,7 +233,7 @@ struct Algo { ...@@ -228,7 +233,7 @@ struct Algo {
auto grid = (n + block - 1) / block; auto grid = (n + block - 1) / block;
// sort // sort
fillIndices<<<grid, block, 0, stream>>>(indices, n); fillIndices<<<grid, block, 0, stream>>>(indices, n);
CHECK_MACA(radixSort( CHECK_METAX(radixSort(
workspace_, workspace_size, workspace_, workspace_size,
logits, sorted, logits, sorted,
indices, indices_out, indices, indices_out,
...@@ -238,7 +243,7 @@ struct Algo { ...@@ -238,7 +243,7 @@ struct Algo {
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature); partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted); setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum // sum
CHECK_MACA(inclusiveSum( CHECK_METAX(inclusiveSum(
workspace_, workspace, workspace_, workspace,
sorted, n, sorted, n,
stream)); stream));
...@@ -251,4 +256,4 @@ struct Algo { ...@@ -251,4 +256,4 @@ struct Algo {
} }
}; };
} // namespace op::random_sample::maca } // namespace op::random_sample::metax
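For context, the CudaTval trait above maps host-side dtype carriers to device-native types at compile time; the bf16 specialization added here extends that same mechanism. A minimal self-contained sketch of the pattern (FrameworkF16 and DeviceHalf are hypothetical stand-ins for fp16_t/half, not project types):

#include <type_traits>

struct FrameworkF16 { unsigned short bits; }; // hypothetical host-side fp16 carrier
struct DeviceHalf   { unsigned short bits; }; // hypothetical device-native half type

// Primary template: by default a dtype maps to itself.
template <class T>
struct DevTval { using Type = T; };

// Specialization: the host fp16 carrier is handled as the device half type.
template <>
struct DevTval<FrameworkF16> { using Type = DeviceHalf; };

template <class T>
using DevTvalT = typename DevTval<T>::Type;

static_assert(std::is_same_v<DevTvalT<float>, float>, "pass-through");
static_assert(std::is_same_v<DevTvalT<FrameworkF16>, DeviceHalf>, "fp16 -> half");

int main() { return 0; }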
+#ifndef __RANDOM_SAMPLE_METAX_H__
+#define __RANDOM_SAMPLE_METAX_H__
+#include "../random_sample.h"
+DESCRIPTOR(metax)
+#endif // __RANDOM_SAMPLE_METAX_H__
#include "../../../devices/maca/common_maca.h" #include "../../../devices/metax/metax_common.h"
#include "../../../devices/maca/maca_handle.h" #include "../../../devices/metax/metax_handle.h"
#include "../info.h" #include "../info.h"
#include "random_sample_kernel.h" #include "random_sample_kernel.h"
#include "random_sample_maca.h" #include "random_sample_metax.h"
namespace op::random_sample::maca { namespace op::random_sample::metax {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal; std::shared_ptr<device::metax::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( ...@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc, infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) { infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::maca::Handle *>(handle_); auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc); auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result); CHECK_RESULT(result);
...@@ -34,15 +34,16 @@ infiniStatus_t Descriptor::create( ...@@ -34,15 +34,16 @@ infiniStatus_t Descriptor::create(
workspace_size = workspace_result.take(); \ workspace_size = workspace_result.take(); \
} break } break
#define CASE_I(CASE, Tidx) \ #define CASE_I(CASE, Tidx) \
case CASE: \ case CASE: \
switch (info.dt_p) { \ switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \ CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \ CASE_P(INFINI_DTYPE_BF16, Tidx, __hpcc_bfloat16); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \ CASE_P(INFINI_DTYPE_F32, Tidx, float); \
default: \ CASE_P(INFINI_DTYPE_F64, Tidx, double); \
abort(); \ default: \
} \ abort(); \
} \
break break
switch (info.dt_i) { switch (info.dt_i) {
...@@ -99,4 +100,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -99,4 +100,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::random_sample::maca } // namespace op::random_sample::metax
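The CASE_I/CASE_P macros above expand into a two-level switch: the outer level dispatches on the index dtype, the inner on the probability dtype, so the BF16 line adds one inner case per index type. A sketch of the same dispatch written out without the project macros (launch<Tidx, Tval>() is a hypothetical stand-in for the kernel launcher):

#include <cstdint>
#include <cstdio>

enum Dtype { DT_F16, DT_BF16, DT_F32, DT_F64, DT_I32, DT_I64 };

template <class Tidx, class Tval>
void launch() { std::printf("launch %zu/%zu\n", sizeof(Tidx), sizeof(Tval)); } // hypothetical launcher

template <class Tidx>
void dispatchVal(Dtype dt_p) {
    switch (dt_p) { // inner switch: probability dtype
    case DT_F32: launch<Tidx, float>(); break;
    case DT_F64: launch<Tidx, double>(); break;
    default: /* F16/BF16 would map to device-native types here */ break;
    }
}

void dispatch(Dtype dt_i, Dtype dt_p) {
    switch (dt_i) { // outer switch: index dtype
    case DT_I32: dispatchVal<int32_t>(dt_p); break;
    case DT_I64: dispatchVal<int64_t>(dt_p); break;
    default: break;
    }
}

int main() { dispatch(DT_I32, DT_F32); }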
#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "infinicore.h" #include "infinicore.h"
#include <cub/device/device_radix_sort.cuh> #include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh> #include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh> #include <cub/device/device_scan.cuh>
namespace op::random_sample::cuda { namespace op::random_sample::nvidia {
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用 // ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
...@@ -193,7 +193,7 @@ struct Algo { ...@@ -193,7 +193,7 @@ struct Algo {
argMax_( argMax_(
kv_pair, kv_pair,
logits, logits,
n, static_cast<int>(n),
workspace, workspace,
workspace_size, stream); workspace_size, stream);
castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair); castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);
...@@ -232,20 +232,20 @@ struct Algo { ...@@ -232,20 +232,20 @@ struct Algo {
auto block = cub::Min()((size_t)block_size, n); auto block = cub::Min()((size_t)block_size, n);
auto grid = (n + block - 1) / block; auto grid = (n + block - 1) / block;
// sort // sort
fillIndices<<<grid, block, 0, stream>>>(indices, n); fillIndices<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(indices, static_cast<int>(n));
CHECK_CUDA(radixSort( CHECK_CUDA(radixSort(
workspace_, workspace_size, workspace_, workspace_size,
logits, sorted, logits, sorted,
indices, indices_out, indices, indices_out,
n, static_cast<int>(n),
stream)); stream));
// softmax // softmax
partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature); partialSoftmaxKernel<<<static_cast<unsigned int>(grid), static_cast<unsigned int>(block), 0, stream>>>(sorted, static_cast<int>(n), temperature);
setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted); setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
// sum // sum
CHECK_CUDA(inclusiveSum( CHECK_CUDA(inclusiveSum(
workspace_, workspace, workspace_, workspace,
sorted, n, sorted, static_cast<int>(n),
stream)); stream));
// sample // sample
randomSampleKernel<<<1, 1, 0, stream>>>( randomSampleKernel<<<1, 1, 0, stream>>>(
...@@ -256,4 +256,4 @@ struct Algo { ...@@ -256,4 +256,4 @@ struct Algo {
} }
}; };
} // namespace op::random_sample::cuda } // namespace op::random_sample::nvidia
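The static_cast changes in this hunk are narrowing fixes rather than behavior changes: grid and block come out of size_t arithmetic, while the <<<...>>> launch configuration takes unsigned int components and the wrapped cub calls take int lengths. A hedged CUDA sketch of the same casting pattern (fillIota and launchFill are illustrative names, not project functions):

#include <cstddef>
#include <cuda_runtime.h>

__global__ void fillIota(int *out, int n) { // hypothetical kernel
    int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
    if (i < n) out[i] = i;
}

void launchFill(int *out, size_t n_, cudaStream_t stream) {
    auto n = static_cast<int>(n_); // cub-style device APIs take int sizes
    size_t block = n_ < 256 ? n_ : 256;
    size_t grid = (n_ + block - 1) / block;
    // <<<grid, block>>> builds dim3 from unsigned int; cast away the size_t narrowing.
    fillIota<<<static_cast<unsigned int>(grid),
               static_cast<unsigned int>(block), 0, stream>>>(out, n);
}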
#include "../../../devices/cuda/cuda_handle.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../info.h" #include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh" #include "random_sample_kernel.cuh"
#include "random_sample_nvidia.cuh"
namespace op::random_sample::cuda { namespace op::random_sample::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -18,7 +18,7 @@ infiniStatus_t Descriptor::create( ...@@ -18,7 +18,7 @@ infiniStatus_t Descriptor::create(
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t result_desc, infiniopTensorDescriptor_t result_desc,
infiniopTensorDescriptor_t probs_desc) { infiniopTensorDescriptor_t probs_desc) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_); auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto result = RandomSampleInfo::create(result_desc, probs_desc); auto result = RandomSampleInfo::create(result_desc, probs_desc);
CHECK_RESULT(result); CHECK_RESULT(result);
...@@ -99,4 +99,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -99,4 +99,4 @@ infiniStatus_t Descriptor::calculate(
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::random_sample::cuda } // namespace op::random_sample::nvidia
@@ -3,6 +3,6 @@
 #include "../random_sample.h"
-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)
 #endif // __RANDOM_SAMPLE_CUDA_CUH__
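DESCRIPTOR(nvidia) stamps the backend's Descriptor declaration into the matching namespace; the macro body lives in ../random_sample.h and is not shown in this diff. A hypothetical, heavily reduced sketch of what such a macro typically expands to (not the project's actual definition):

// Hypothetical reduction of the pattern; the real DESCRIPTOR macro is defined elsewhere.
#define DESCRIPTOR(NS)                                    \
    namespace op::random_sample::NS {                     \
    class Descriptor {                                    \
    public:                                               \
        ~Descriptor();                                    \
        /* create/calculate declarations would go here */ \
    };                                                    \
    }

DESCRIPTOR(nvidia) // declares op::random_sample::nvidia::Descriptor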
@@ -5,11 +5,11 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/random_sample_cpu.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/random_sample_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/random_sample_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/random_sample_maca.h"
+#include "metax/random_sample_metax.h"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/random_sample_aclnn.h"
@@ -35,11 +35,14 @@ infiniopCreateRandomSampleDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    CREATE(INFINI_DEVICE_METAX, maca);
+    CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
     CREATE(INFINI_DEVICE_ASCEND, ascend);
@@ -68,11 +71,14 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
 #ifdef ENABLE_CPU_API
     GET(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    GET(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    GET(INFINI_DEVICE_METAX, maca);
+    GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
     GET(INFINI_DEVICE_ASCEND, ascend);
@@ -111,11 +117,14 @@ __C infiniStatus_t infiniopRandomSample(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    CALCULATE(INFINI_DEVICE_METAX, maca);
+    CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
     CALCULATE(INFINI_DEVICE_ASCEND, ascend);
@@ -141,11 +150,14 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-    DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    DELETE(INFINI_DEVICE_METAX, maca);
+    DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
     DELETE(INFINI_DEVICE_ASCEND, ascend);
...
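Note how the Iluvatar cases reuse the nvidia namespace: one compiled backend serves two device enums. A minimal sketch of that switch-plus-macro dispatch shape (all names here are stand-ins, not the project's real signatures):

#include <cstdio>

namespace nvidia { void create() { std::puts("nvidia backend"); } }
namespace cpu    { void create() { std::puts("cpu backend"); } }

enum Device { DEVICE_CPU, DEVICE_NVIDIA, DEVICE_ILUVATAR };

// One case per device; two devices may share one backend namespace.
#define CREATE(CASE, NS) \
    case CASE:           \
        NS::create();    \
        break

void dispatch(Device d) {
    switch (d) {
        CREATE(DEVICE_CPU, cpu);
        CREATE(DEVICE_NVIDIA, nvidia);
        CREATE(DEVICE_ILUVATAR, nvidia); // Iluvatar reuses the nvidia code path
    }
}

int main() { dispatch(DEVICE_ILUVATAR); }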
-#ifndef __REARRANGE_MACA_H__
-#define __REARRANGE_MACA_H__
-#include "../rearrange.h"
-DESCRIPTOR(maca)
-#endif // __REARRANGE_MACA_H__
-#ifndef __REARRANGE_MACA_KERNEL_H__
-#define __REARRANGE_MACA_KERNEL_H__
+#ifndef __REARRANGE_METAX_KERNEL_H__
+#define __REARRANGE_METAX_KERNEL_H__
-#include "../../../devices/maca/common_maca.h"
-#include "../../../devices/maca/maca_kernel_common.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_kernel_common.h"
 #define ARRAY_TYPE_STRIDE ptrdiff_t
 #define ARRAY_TYPE_SIZE size_t
@@ -328,4 +328,4 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
     return utils::Result<void *>(kernel_func);
 }
-#endif // __REARRANGE_MACA_KERNEL_H__
+#endif // __REARRANGE_METAX_KERNEL_H__
+#ifndef __REARRANGE_METAX_H__
+#define __REARRANGE_METAX_H__
+#include "../rearrange.h"
+DESCRIPTOR(metax)
+#endif // __REARRANGE_METAX_H__
#include "../../../tensor.h" #include "../../../tensor.h"
#include "rearrange_kernel.h" #include "rearrange_kernel.h"
#include "rearrange_maca.h" #include "rearrange_metax.h"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory> #include <memory>
#include <stdint.h> #include <stdint.h>
#include <vector> #include <vector>
namespace op::rearrange::maca { namespace op::rearrange::metax {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal; std::shared_ptr<device::metax::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -47,7 +47,7 @@ infiniStatus_t Descriptor::create( ...@@ -47,7 +47,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
std::move(*meta), std::move(*meta),
new Opaque{reinterpret_cast<device::maca::Handle *>(handle)->internal()}, new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -429,18 +429,18 @@ infiniStatus_t launchKernel( ...@@ -429,18 +429,18 @@ infiniStatus_t launchKernel(
infiniStatus_t Descriptor::calculate( infiniStatus_t Descriptor::calculate(
void *y, void *y,
const void *x, const void *x,
void *stream) const { void *stream_) const {
auto maca_stream = reinterpret_cast<hcStream_t>(stream); auto stream = reinterpret_cast<hcStream_t>(stream_);
// 如果没有维度,直接进行内存拷贝 // 如果没有维度,直接进行内存拷贝
if (_meta.ndim() == 0) { if (_meta.ndim() == 0) {
auto err = hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, maca_stream); auto err = hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, stream);
if (err != hcSuccess) { if (err != hcSuccess) {
return INFINI_STATUS_INTERNAL_ERROR; return INFINI_STATUS_INTERNAL_ERROR;
} }
CHECK_OR_RETURN(hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, maca_stream) == hcSuccess, CHECK_OR_RETURN(hcMemcpyAsync(y, x, _meta.unit(), hcMemcpyDeviceToDevice, stream) == hcSuccess,
INFINI_STATUS_INTERNAL_ERROR); INFINI_STATUS_INTERNAL_ERROR);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -449,7 +449,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -449,7 +449,7 @@ infiniStatus_t Descriptor::calculate(
int max_threads = _opaque->internal->maxThreadsPerBlock(); int max_threads = _opaque->internal->maxThreadsPerBlock();
// 准备参数 // 准备参数
auto params_result = prepareRearrangeParams(_meta, std::min(MACA_BLOCK_SIZE_1024, max_threads)); auto params_result = prepareRearrangeParams(_meta, std::min(METAX_BLOCK_SIZE_1024, max_threads));
CHECK_RESULT(params_result); CHECK_RESULT(params_result);
auto params = params_result.take(); auto params = params_result.take();
...@@ -469,10 +469,10 @@ infiniStatus_t Descriptor::calculate( ...@@ -469,10 +469,10 @@ infiniStatus_t Descriptor::calculate(
size_t block_size = params.block_len_total; size_t block_size = params.block_len_total;
if (block_size <= MACA_BLOCK_SIZE_512) { if (block_size <= METAX_BLOCK_SIZE_512) {
status = launchKernel<MACA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), maca_stream); status = launchKernel<METAX_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), stream);
} else if (block_size <= MACA_BLOCK_SIZE_1024) { } else if (block_size <= METAX_BLOCK_SIZE_1024) {
status = launchKernel<MACA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), maca_stream); status = launchKernel<METAX_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), stream);
} else { } else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
} }
...@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate(
return status; return status;
} }
} // namespace op::rearrange::maca } // namespace op::rearrange::metax
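The tail of calculate picks a compile-time block size from the runtime block_len_total, so only the 512 and 1024 instantiations of launchKernel ever exist. A hedged sketch of that dispatch pattern (the real launcher takes y, x, grid size, params, unit, and stream; this one is simplified):

#include <cstddef>
#include <cstdio>

template <size_t BLOCK_SIZE>
int launchKernel(size_t work) { // hypothetical launcher returning a status code
    std::printf("block=%zu work=%zu\n", BLOCK_SIZE, work);
    return 0;
}

int dispatch(size_t block_len_total) {
    if (block_len_total <= 512) {
        return launchKernel<512>(block_len_total);
    } else if (block_len_total <= 1024) {
        return launchKernel<1024>(block_len_total);
    }
    return -1; // no instantiation can cover this shape
}

int main() { return dispatch(600); }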
 #ifndef __REARRANGE_CUDA_KERNEL_H__
 #define __REARRANGE_CUDA_KERNEL_H__
-#include "../../../devices/cuda/cuda_common.cuh"
+#include "../../../devices/nvidia/nvidia_common.cuh"
 #define ARRAY_TYPE_STRIDE ptrdiff_t
 #define ARRAY_TYPE_SIZE size_t
@@ -63,13 +63,13 @@ struct Constraint {
         size_t remaining \
             = blockIdx.x; \
 \
-        for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
+        for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \
             size_t idx = remaining % grid_len.a[i]; \
             remaining /= grid_len.a[i]; \
             src_offset += idx * src_grid_stride.a[i]; \
             dst_offset += idx * dst_grid_stride.a[i]; \
             if (constraint_num > 0) { \
-                for (ssize_t j = 0; j < constraint_num; j++) { \
+                for (ptrdiff_t j = 0; j < constraint_num; j++) { \
                     if (i == constraints.a[j].grid_idx) { \
                         constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
                     } \
@@ -80,7 +80,7 @@ struct Constraint {
             /* Store the results in shared memory */ \
             shared_src_offset = src_offset; \
             shared_dst_offset = dst_offset; \
-            for (ssize_t j = 0; j < constraint_num; j++) { \
+            for (ptrdiff_t j = 0; j < constraint_num; j++) { \
                 shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
             } \
         } \
@@ -92,18 +92,18 @@ struct Constraint {
         ptrdiff_t src_offset = shared_src_offset; \
         ptrdiff_t dst_offset = shared_dst_offset; \
         ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
-        for (ssize_t j = 0; j < constraint_num; j++) { \
+        for (ptrdiff_t j = 0; j < constraint_num; j++) { \
            constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
         } \
 \
-        for (ssize_t i = block_array_size - 1; i >= 0; i--) { \
+        for (ptrdiff_t i = block_array_size - 1; i >= 0; i--) { \
             size_t idx = remaining % block_len.a[i]; \
             remaining /= block_len.a[i]; \
             /* Compute the offsets */ \
             src_offset += idx * src_block_stride.a[i]; \
             dst_offset += idx * dst_block_stride.a[i]; \
             if (constraint_num > 0) { \
-                for (ssize_t j = 0; j < constraint_num; j++) { \
+                for (ptrdiff_t j = 0; j < constraint_num; j++) { \
                     if (i == constraints.a[j].block_idx) { \
                         if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
                             return; \
@@ -115,7 +115,7 @@ struct Constraint {
 \
         src_offset += remaining * src_block_stride.a[0]; \
         dst_offset += remaining * dst_block_stride.a[0]; \
-        for (ssize_t j = 0; j < constraint_num; j++) { \
+        for (ptrdiff_t j = 0; j < constraint_num; j++) { \
             if (0 == constraints.a[j].block_idx) { \
                 if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
                     return; \
@@ -133,7 +133,7 @@ struct Constraint {
         ptrdiff_t dst_offset = 0; \
         size_t remaining = blockIdx.x; \
 \
-        for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
+        for (ptrdiff_t i = grid_array_size - 1; i >= 0; i--) { \
             size_t idx = remaining % grid_len.a[i]; \
             remaining /= grid_len.a[i]; \
             src_offset += idx * src_grid_stride.a[i]; \
@@ -152,7 +152,7 @@ struct Constraint {
         ptrdiff_t src_offset = shared_src_offset; \
         ptrdiff_t dst_offset = shared_dst_offset; \
 \
-        for (ssize_t i = block_array_size - 1; i > 0; i--) { \
+        for (ptrdiff_t i = block_array_size - 1; i > 0; i--) { \
             size_t idx = remaining % block_len.a[i]; \
             remaining /= block_len.a[i]; \
             /* Compute the offsets */ \
...
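The macro bodies above all follow one scheme: decode the flat blockIdx.x into per-dimension indices by repeated modulo/divide against the length array, accumulating strided offsets as they go; ptrdiff_t replaces the POSIX-only ssize_t so the header also builds with toolchains that lack it. The core loop, extracted as a plain C++ sketch:

#include <cstddef>

// Decode a flat index into an offset for an n-dimensional view (row-major
// order, last dimension fastest). len/stride describe the view; n is its rank.
ptrdiff_t decodeOffset(size_t flat, const size_t *len,
                       const ptrdiff_t *stride, ptrdiff_t n) {
    ptrdiff_t offset = 0;
    size_t remaining = flat;
    for (ptrdiff_t i = n - 1; i >= 0; i--) { // signed loop var: must pass below 0 to stop
        size_t idx = remaining % len[i];
        remaining /= len[i];
        offset += static_cast<ptrdiff_t>(idx) * stride[i];
    }
    return offset;
}

int main() {
    size_t len[2] = {3, 4};
    ptrdiff_t stride[2] = {4, 1};
    return decodeOffset(7, len, stride, 2) == 7 ? 0 : 1; // flat 7 -> (1,3) -> 1*4 + 3*1
}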
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../../../tensor.h" #include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh" #include "rearrange_kernel.cuh"
#include "rearrange_nvidia.cuh"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <memory> #include <memory>
#include <stdint.h> #include <stdint.h>
#include <vector> #include <vector>
namespace op::rearrange::cuda { namespace op::rearrange::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::nvidia::Handle::Internal> internal;
}; };
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -49,7 +49,7 @@ infiniStatus_t Descriptor::create( ...@@ -49,7 +49,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
std::move(*meta), std::move(*meta),
new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()}, new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta ...@@ -297,7 +297,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
block_len.push_back(split_dims[j].num_per_block); block_len.push_back(split_dims[j].num_per_block);
src_block_stride.push_back(dims[i].src_stride); src_block_stride.push_back(dims[i].src_stride);
dst_block_stride.push_back(dims[i].dst_stride); dst_block_stride.push_back(dims[i].dst_stride);
split_dims[j].array_struct_idx_block = block_dim; split_dims[j].array_struct_idx_block = static_cast<int>(block_dim);
block_dim += 1; block_dim += 1;
block_len_total *= split_dims[j].num_per_block; block_len_total *= split_dims[j].num_per_block;
} }
...@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta ...@@ -316,7 +316,7 @@ utils::Result<RearrangeParams> prepareRearrangeParams(const utils::RearrangeMeta
grid_len.push_back(split_dims[j].num_per_grid); grid_len.push_back(split_dims[j].num_per_grid);
src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block); src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block); dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
split_dims[j].array_struct_idx_grid = grid_len.size() - 1; split_dims[j].array_struct_idx_grid = static_cast<int>(grid_len.size() - 1);
} }
} }
...@@ -420,7 +420,7 @@ infiniStatus_t launchKernel( ...@@ -420,7 +420,7 @@ infiniStatus_t launchKernel(
CHECK_OR_RETURN(cudaLaunchKernel( CHECK_OR_RETURN(cudaLaunchKernel(
kernel_func, kernel_func,
grid_size, BLOCK_SIZE, static_cast<unsigned int>(grid_size), static_cast<unsigned int>(BLOCK_SIZE),
args, 0, stream) args, 0, stream)
== cudaSuccess, == cudaSuccess,
INFINI_STATUS_INTERNAL_ERROR); INFINI_STATUS_INTERNAL_ERROR);
...@@ -482,4 +482,4 @@ infiniStatus_t Descriptor::calculate( ...@@ -482,4 +482,4 @@ infiniStatus_t Descriptor::calculate(
return status; return status;
} }
} // namespace op::rearrange::cuda } // namespace op::rearrange::nvidia
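cudaLaunchKernel takes dim3 grid/block arguments whose components are unsigned int, so passing size_t values directly narrows implicitly; the casts in this hunk make that explicit. A hedged usage sketch (copyKernel and launch are illustrative names, not project functions):

#include <cstddef>
#include <cuda_runtime.h>

__global__ void copyKernel(void *dst, const void *src) { /* illustrative body omitted */ }

cudaError_t launch(void *dst, const void *src, size_t grid_size,
                   size_t block_size, cudaStream_t stream) {
    void *args[] = {&dst, &src};
    // dim3 components are unsigned int; cast the size_t launch dims explicitly.
    return cudaLaunchKernel(
        reinterpret_cast<void *>(copyKernel),
        dim3(static_cast<unsigned int>(grid_size)),
        dim3(static_cast<unsigned int>(block_size)),
        args, 0, stream);
}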
@@ -3,6 +3,6 @@
 #include "../rearrange.h"
-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)
 #endif // __REARRANGE_CUDA_H__
@@ -8,12 +8,11 @@
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rearrange_ascend.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/rearrange_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/rearrange_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rearrange_maca.h"
+#include "metax/rearrange_metax.h"
 #endif
 __C infiniStatus_t infiniopCreateRearrangeDescriptor(
@@ -39,11 +38,14 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
     CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
-#ifdef ENABLE_CUDA_API
-    CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    CREATE(INFINI_DEVICE_METAX, maca);
+    CREATE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -72,11 +74,14 @@ __C infiniStatus_t infiniopRearrange(
     CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
-#ifdef ENABLE_CUDA_API
-    CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    CALCULATE(INFINI_DEVICE_METAX, maca);
+    CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
@@ -102,12 +107,14 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_ASCEND_API
     DELETE(INFINI_DEVICE_ASCEND, ascend);
 #endif
-#ifdef ENABLE_CUDA_API
-    DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+    DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-    DELETE(INFINI_DEVICE_METAX, maca);
+    DELETE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
...
#include "relu_cpu.h"
namespace op::relu::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::relu::cpu
#ifndef __RELU_CPU_H__
#define __RELU_CPU_H__
#include <algorithm>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(relu, cpu)
namespace op::relu::cpu {
typedef struct ReluOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x) const {
return std::max<T>(x, 0);
}
} ReluOp;
} // namespace op::relu::cpu
#endif // __RELU_CPU_H__
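ReluOp is a stateless functor handed to the shared elementwise machinery; num_inputs tells the framework how many operands to feed operator(). A minimal sketch of how such a functor gets applied over a buffer (the calculate driver below is a stand-in for the project's elementwise backend, not its actual API):

#include <algorithm>
#include <cstddef>
#include <vector>

struct ReluOp {
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const { return std::max<T>(x, 0); }
};

// Stand-in for the elementwise driver: apply Op to every element.
template <typename Op, typename T>
void calculate(const T *in, T *out, size_t n) {
    Op op;
    for (size_t i = 0; i < n; ++i) out[i] = op(in[i]);
}

int main() {
    std::vector<float> x{-1.5f, 0.0f, 2.0f}, y(3);
    calculate<ReluOp>(x.data(), y.data(), x.size()); // y = {0, 0, 2}
}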
#ifndef __RELU_METAX_API_H__
#define __RELU_METAX_API_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(relu, metax)
#endif
#endif // __RELU_METAX_API_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/metax/metax_common.h"
#include "relu_metax.h"
namespace op::relu::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create METAX elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const auto &ndim{_info.getNdim()};
const auto &x_shape_{_info.getInputShape(0)};
const auto &x_strides_{_info.getInputStrides(0)};
std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
auto x_data{const_cast<void *>(inputs[0])};
auto x_shape{x_shape_vec.data()};
auto x_strides{x_strides_vec.data()};
const NineToothedTensor x{x_data, x_shape, x_strides};
const auto &y_shape_{_info.getOutputShape()};
const auto &y_strides_{_info.getOutputStrides()};
std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
auto y_data{output};
auto y_shape{y_shape_vec.data()};
auto y_strides{y_strides_vec.data()};
const NineToothedTensor y{y_data, y_shape, y_strides};
constexpr auto block_size{1024};
switch (_dtype) {
case INFINI_DTYPE_F16:
case INFINI_DTYPE_F32:
case INFINI_DTYPE_F64:
case INFINI_DTYPE_BF16:
if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::relu::metax
#endif
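The metax calculate above copies the descriptor's borrowed shape/stride pointers into owned vectors before building NineToothedTensor views for the generated launcher, which keeps the pointers valid for the duration of the call. A reduced sketch of that marshaling pattern (TensorView is a stand-in; NineToothedTensor's real layout comes from the generated header):

#include <cstdint>
#include <vector>

struct TensorView { // stand-in for NineToothedTensor
    void *data;
    uint64_t *shape;
    int64_t *strides;
};

// Copy borrowed shape/stride arrays into vectors that outlive the launch call,
// then expose raw pointers through the view struct the launcher expects.
TensorView makeView(void *data, const uint64_t *shape, const int64_t *strides,
                    size_t ndim, std::vector<uint64_t> &shape_buf,
                    std::vector<int64_t> &stride_buf) {
    shape_buf.assign(shape, shape + ndim);
    stride_buf.assign(strides, strides + ndim);
    return TensorView{data, shape_buf.data(), stride_buf.data()};
}

int main() {
    uint64_t shape[2] = {2, 3};
    int64_t strides[2] = {3, 1};
    float data[6] = {};
    std::vector<uint64_t> sb;
    std::vector<int64_t> tb;
    TensorView v = makeView(data, shape, strides, 2, sb, tb);
    return v.shape[0] == 2 ? 0 : 1;
}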