Commit f46e9f65 authored by zhangyue's avatar zhangyue
Browse files

issue/1008: adapt lpnorm layernorm softmax rearrange paged_attention for iluvatar

parent bd0c922a
......@@ -20,9 +20,9 @@ def run_tests(args):
#"dequantize_awq.py",
"gelu.py",
"gemm.py",
#"layer_norm.py",
"layer_norm.py",
"logsoftmax.py",
#"lp_norm.py",
"lp_norm.py",
"mul.py",
"ones.py",
"random_sample.py",
......@@ -31,7 +31,7 @@ def run_tests(args):
"rms_norm.py",
"rope.py",
"sigmoid.py",
#"softmax.py",
"softmax.py",
"softplus.py",
"sub.py",
"swiglu.py",
......@@ -39,6 +39,9 @@ def run_tests(args):
"topkrouter.py",
"topksoftmax.py",
"zeros.py",
"paged_attention.py",
"paged_caching.py",
"paged_attention_prefill.py"
]:
result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
......@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
_info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
_info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
......
......@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
......@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// GET(INFINI_DEVICE_ILUVATAR, nvidia);
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
......@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
......@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
// DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia);
......
......@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
......
......@@ -8,8 +8,8 @@
#define ARRAY_TYPE_SIZE size_t
// 与 DEFINE_KERNELS_BY_CONSTRAINT 耦合,需要同时修改
#define MAX_BLOCK_ARRAY_SIZE 5
#define MAX_GRID_ARRAY_SIZE 5
#define MAX_BLOCK_ARRAY_SIZE 6
#define MAX_GRID_ARRAY_SIZE 6
template <int ArrSize, typename ArrayType>
struct ArrayStruct {
......@@ -185,32 +185,43 @@ struct Constraint {
DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)
// 与 MAX_BLOCK_ARRAY_SIZE 和 MAX_GRID_ARRAY_SIZE 耦合,需要同时修改
// 为1-5和1-5的所有组合生成内核
// 为1-6和1-6的所有组合生成内核
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
DEFINE_KERNELS_BY_CONSTRAINT(1, 6)
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
DEFINE_KERNELS_BY_CONSTRAINT(2, 6)
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
DEFINE_KERNELS_BY_CONSTRAINT(3, 6)
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
DEFINE_KERNELS_BY_CONSTRAINT(4, 6)
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
DEFINE_KERNELS_BY_CONSTRAINT(5, 6)
DEFINE_KERNELS_BY_CONSTRAINT(6, 1)
DEFINE_KERNELS_BY_CONSTRAINT(6, 2)
DEFINE_KERNELS_BY_CONSTRAINT(6, 3)
DEFINE_KERNELS_BY_CONSTRAINT(6, 4)
DEFINE_KERNELS_BY_CONSTRAINT(6, 5)
DEFINE_KERNELS_BY_CONSTRAINT(6, 6)
// 准备参数结构体
struct RearrangeParams {
......@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
case 5: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5); \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6); \
break; \
}
#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
......@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
case 5: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(5); \
break; \
case 6: \
GET_REARRANGE_KERNEL_BY_GRID_NUM(6); \
break; \
}
GET_REARRANGE_KERNEL_BY_BLOCK_NUM
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
......@@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -65,7 +68,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -101,7 +106,9 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -131,7 +138,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_ALI_API
DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......
......@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
......@@ -36,6 +36,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#endif
#ifdef ENABLE_ALI_API
CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#endif
#ifdef ENABLE_ALI_API
GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -101,6 +107,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#endif
#ifdef ENABLE_ALI_API
CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......@@ -131,6 +140,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#endif
#ifdef ENABLE_ALI_API
DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
......
......@@ -15,6 +15,7 @@ from libinfiniop import (
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
InfiniDeviceEnum,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -112,6 +113,12 @@ def test(
dtype=None,
sync=None,
):
# Skip strided cases on Iluvatar: Ones with non-contiguous tensors can hang the GPU (requires ixsmi -r to recover)
if device == InfiniDeviceEnum.ILUVATAR and (
x_stride is not None or y_stride is not None
):
return
if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
x = TestTensor(shape, x_stride, dtype, device)
elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
......
......@@ -15,6 +15,7 @@ from libinfiniop import (
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
InfiniDeviceEnum,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -114,6 +115,12 @@ def test(
dtype=None,
sync=None,
):
# Skip strided cases on Iluvatar: Zeros with non-contiguous tensors can hang the GPU (requires ixsmi -r to recover)
if device == InfiniDeviceEnum.ILUVATAR and (
x_stride is not None or y_stride is not None
):
return
if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
x = TestTensor(shape, x_stride, dtype, device)
elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
......
......@@ -115,10 +115,12 @@ option("iluvatar-gpu")
set_description("Whether to compile implementations for Iluvatar GPU")
option_end()
option("ivcore-20")
set_default(false)
option("iluvatar_arch")
set_default("ivcore20")
set_showmenu(true)
set_description("Use ivcore20")
set_description("Set Iluvatar GPU architecture (e.g. ivcore20)")
set_values("ivcore20")
set_category("option")
option_end()
if has_config("iluvatar-gpu") then
......
toolchain("iluvatar.toolchain")
local iluvatar_arch = get_config("iluvatar_arch") or "ivcore20"
toolchain("iluvatar.toolchain")
set_toolset("cc" , "clang" )
set_toolset("cxx" , "clang++")
set_toolset("cu" , "clang++")
......@@ -44,9 +46,7 @@ target("infiniop-iluvatar")
set_warnings("all", "error")
add_cuflags("-Wno-error=unused-private-field", "-Wno-error=unused-variable", "-Wno-unused-variable")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
if has_config("ivcore-20") then
add_cuflags("--cuda-gpu-arch=ivcore20", {force = true})
end
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
......@@ -75,6 +75,7 @@ target("infinirt-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
add_cxxflags("-fPIC")
......@@ -97,6 +98,7 @@ target("infiniccl-iluvatar")
set_warnings("all", "error")
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
add_cxxflags("-fPIC")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment