Commit 46da1a27 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: cpu and cuda matmul

parents
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side RMSNorm descriptor. Only the leading device id
# field is declared; the rest of the C struct is opaque to this test and is
# only ever handled through a pointer.
class RMSNormDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for RMSNorm descriptor handles.
infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)
def rms_norm(x, w, eps):
    """Reference RMSNorm: scale x by the reciprocal root-mean-square of its
    last dimension (computed in float32 for stability), cast back to the
    input dtype, then apply the elementwise weight w."""
    orig_dtype = x.dtype
    h = x.to(torch.float32)
    inv_rms = torch.rsqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
    return w * (h * inv_rms).to(orig_dtype)
def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16):
    """Run one RMSNorm case: invoke the library kernel through ctypes and
    compare its output y against the PyTorch reference `rms_norm`."""
    print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
          f" dtype:{dtype} w_dtype:{w_dtype}")
    y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
    x = torch.rand(x_shape, dtype=dtype).to(torch_device)
    w = torch.ones(w_shape, dtype=w_dtype).to(torch_device)
    eps = 1e-5
    ans = rms_norm(x, w, eps)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(w, lib)
    descriptor = infiniopRMSNormDescriptor_t()
    # NOTE(review): computed but never passed to the library — the descriptor
    # presumably derives the weight dtype from w_tensor's descriptor; confirm
    # whether this local is dead code.
    w_dataType = 0 if w_dtype==torch.float16 else 1
    check_error(
        lib.infiniopCreateRMSNormDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor,
            w_tensor.descriptor, eps
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    w_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRMSNormWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, y.device)
    check_error(
        lib.infiniopRMSNorm(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w_tensor.data,
            None,  # presumably the stream handle (default) — TODO confirm
        )
    )
    # fp16 tolerance: the kernel may accumulate in a different precision.
    assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
    check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RMSNorm case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RMSNorm case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RMSNorm case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RMSNorm case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
        test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # y_shape, x_shape, w_shape, dtype, w_dtype
        ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float16),
        ((16, 2048), (16, 2048), (2048,), torch.float16, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the RMSNorm C API.
    lib.infiniopCreateRMSNormDescriptor.restype = c_int32
    lib.infiniopCreateRMSNormDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRMSNormDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
    ]
    lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
    lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
        infiniopRMSNormDescriptor_t,
        POINTER(c_uint64),
    ]
    # BUG FIX: was `restypes` (typo) — that assignment created an unused
    # attribute and left the return type at the ctypes default instead of
    # declaring it explicitly.
    lib.infiniopRMSNorm.restype = c_int32
    lib.infiniopRMSNorm.argtypes = [
        infiniopRMSNormDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
    lib.infiniopDestroyRMSNormDescriptor.argtypes = [
        infiniopRMSNormDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import ctypes
from ctypes import c_float, POINTER, c_void_p, c_int32, c_uint64, Structure, byref
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
U64,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side RoPE descriptor. Only the leading device id
# field is declared; the remainder of the C struct is opaque to this test.
class RoPEDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for RoPE descriptor handles.
infiniopRoPEDescriptor_t = POINTER(RoPEDescriptor)
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Reshape `freqs_cis` (x.shape[0], x.shape[-1]) so it broadcasts
    against `x`: keep x's first and last dims, collapse the middle to 1s."""
    rank = x.ndim
    assert rank > 1
    assert freqs_cis.shape == (x.shape[0], x.shape[-1])
    target = [1] * rank
    target[0] = x.shape[0]
    target[-1] = x.shape[-1]
    return freqs_cis.view(*target)
def rotary_embedding(t, pos, theta, torch_device):
    """Reference RoPE: rotate adjacent channel pairs of `t` (seq, heads,
    head_dim) by position-dependent angles derived from `theta`, implemented
    as a complex multiplication."""
    head_dim = t.shape[2]
    inv_freq = 1.0 / (
        theta ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim)
    )
    inv_freq = inv_freq.to(torch_device)
    angles = torch.outer(pos, inv_freq)
    # Unit-magnitude complex rotations e^{i*angle}.
    rotations = torch.polar(torch.ones_like(angles), angles)
    t_complex = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2))
    rotations = reshape_for_broadcast(rotations, t_complex)
    return torch.view_as_real(t_complex * rotations).flatten(2).to(t.dtype)
def sin_cos_table(max_seq_len, dim, torch_device, theta):
    """Build (sin, cos) lookup tables of shape (max_seq_len, dim).

    Each of the dim//2 frequencies is duplicated so adjacent columns share
    the same angle, matching kernels that rotate interleaved channel pairs.
    """
    positions = torch.arange(
        0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
    )
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    inv_freq = inv_freq.to(torch_device)
    # (a0, a1, a2) -> (a0, a0, a1, a1, a2, a2)
    inv_freq = torch.repeat_interleave(inv_freq, repeats=2)
    angles = torch.outer(positions, inv_freq)
    return torch.sin(angles), torch.cos(angles)
def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
    """Run one RoPE case: apply the library kernel in place to `t` and
    compare against the PyTorch complex-rotation reference."""
    print(
        f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
    )
    t = torch.rand(shape, dtype=dtype)
    if strides is not None:
        t = rearrange_tensor(t, strides)
    posTmp = torch.arange(0, t.shape[0])
    # Pack each position as two consecutive int32 words (low word = position,
    # high word = 0). Combined with the U64 dtype override below, the kernel
    # presumably reads each pair as one 64-bit position — this assumes a
    # little-endian host; TODO(review): confirm against the kernel's layout.
    pos = torch.zeros(2 * posTmp.shape[0], dtype = torch.int32)
    for i in range(posTmp.shape[0]):
        pos[2 * i] = posTmp[i]
        pos[2 * i + 1] = 0
    theta = 1e4
    if torch_device == 'mlu' or torch_device == 'npu':
        # Compute the reference on the host first, then move data over.
        ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device)
        pos = pos.to(torch_device)
        t = t.to(torch_device)
    else:
        t = t.to(torch_device)
        pos = pos.to(torch_device)
        ans = rotary_embedding(t, posTmp.to(torch_device), theta, torch_device)
    descriptor = infiniopRoPEDescriptor_t()
    # 2x table length for test
    sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
    t_tensor = to_tensor(t, lib)
    # Slicing keeps the descriptor's logical length at seq_len while the U64
    # override doubles the element width, covering the full packed buffer.
    pos_tensor = to_tensor(pos[: t.shape[0]], lib)
    pos_tensor.descriptor.contents.dt = U64
    sin_table_tensor = to_tensor(sin_table, lib)
    cos_table_tensor = to_tensor(cos_table, lib)
    if torch_device == "npu":
        torch.npu.synchronize()
    check_error(
        lib.infiniopCreateRoPEDescriptor(
            handle,
            byref(descriptor),
            t_tensor.descriptor,
            pos_tensor.descriptor,
            sin_table_tensor.descriptor,
            cos_table_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    t_tensor.descriptor.contents.invalidate()
    pos_tensor.descriptor.contents.invalidate()
    sin_table_tensor.descriptor.contents.invalidate()
    cos_table_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRoPEWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, t.device)
    check_error(
        lib.infiniopRoPE(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            t_tensor.data,
            pos_tensor.data,
            sin_table_tensor.data,
            cos_table_tensor.data,
            None,  # presumably the stream handle (default) — TODO confirm
        )
    )
    # The kernel writes its result in place into t.
    assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroyRoPEDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RoPE case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "cpu", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RoPE case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "cuda", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RoPE case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "mlu", shape, strides, dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RoPE case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for shape, strides, dtype in test_cases:
        test(lib, handle, "npu", shape, strides, dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # shape, strides, dtype
        ((1, 32, 128), None, torch.float16),
        ((1, 32, 64), None, torch.float16),
        # Ascend cannot handle the next case yet: a last dimension <= 32 is
        # problematic, possibly related to the internal implementation of its
        # core GatherMask interface; 48, 64 and 128 are currently supported.
        ((4, 1, 32), None, torch.float16),
        ((1, 32, 128), None, torch.float16),
        ((3, 32, 128), (8000, 200, 1), torch.float16),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the RoPE C API.
    lib.infiniopCreateRoPEDescriptor.restype = c_int32
    lib.infiniopCreateRoPEDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRoPEDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
    lib.infiniopGetRoPEWorkspaceSize.argtypes = [
        infiniopRoPEDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopRoPE.restype = c_int32
    lib.infiniopRoPE.argtypes = [
        infiniopRoPEDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyRoPEDescriptor.restype = c_int32
    lib.infiniopDestroyRoPEDescriptor.argtypes = [
        infiniopRoPEDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# ctypes mirror of the C-side SwiGLU descriptor. Only the leading device id
# field is declared; the remainder of the C struct is opaque to this test.
class SwiGLUDescriptor(Structure):
    _fields_ = [("device", c_int32)]


# Pointer type passed across the C ABI for SwiGLU descriptor handles.
infiniopSwiGLUDescriptor_t = POINTER(SwiGLUDescriptor)
def swiglu(a, b):
    """Reference SwiGLU gate: a * b * sigmoid(b).

    The exponential is evaluated in float32 and cast back to b's dtype,
    mirroring the kernel's mixed-precision path.
    """
    gate_denominator = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / gate_denominator
def test_out_of_place(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case writing into a distinct output tensor c and
    compare against the PyTorch reference `swiglu`.

    sync: optional callable (e.g. torch.npu.synchronize) invoked after the
    reference computation for backends with asynchronous launches.
    """
    print(
        f"Testing SwiGLU on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
    )
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)
    # Optionally re-lay-out the operands with custom (possibly padded) strides.
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None:
        c = rearrange_tensor(c, c_stride)
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    c_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(c, ans, atol=1e-4, rtol=1e-2)
    print("out-of-place Test passed!")
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_in_place1(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case with the output aliased onto input a
    (c == a), verifying the kernel supports this in-place form."""
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    # Reference must be computed before a is overwritten by the kernel.
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    # a_tensor serves as both output and first input descriptor.
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            a_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, a_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(a, ans, atol=1e-4, rtol=1e-2)
    print("in-place1 Test passed!")
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_in_place2(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    dtype=torch.float16,
    sync=None,
):
    """Run one SwiGLU case with the output aliased onto input b
    (c == b), verifying the kernel supports this in-place form."""
    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    # Reference must be computed before b is overwritten by the kernel.
    ans = swiglu(a, b)
    if sync is not None:
        sync()
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    descriptor = infiniopSwiGLUDescriptor_t()
    # b_tensor serves as both output and second input descriptor.
    check_error(
        lib.infiniopCreateSwiGLUDescriptor(
            handle,
            ctypes.byref(descriptor),
            b_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    # (only a and b exist here — the output shares b's descriptor).
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopSwiGLU(
            descriptor, b_tensor.data, a_tensor.data, b_tensor.data, None
        )
    )
    assert torch.allclose(b, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroySwiGLUDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on CPU."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "cpu", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "cpu", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on CUDA."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "cuda", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "cuda", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on MLU."""
    import torch_mlu  # registers the "mlu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "mlu", shape, a_stride, b_stride, c_stride, dtype)
        test_in_place1(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
        test_in_place2(lib, handle, "mlu", shape, a_stride, b_stride, dtype)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run the out-of-place and both in-place SwiGLU variants on the Ascend
    NPU, synchronizing the device around each reference computation."""
    import torch_npu  # registers the "npu" device with torch
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    npu_sync = torch.npu.synchronize
    for shape, a_stride, b_stride, c_stride, dtype in test_cases:
        test_out_of_place(lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, npu_sync)
        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, npu_sync)
        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, npu_sync)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # shape, a_stride, b_stride, c_stride, dtype
        ((13, 4), None, None, None, torch.float16),
        ((13, 4), (10, 1), (10, 1), (10, 1), torch.float16),
        ((16, 5632), None, None, None, torch.float16),
        ((16, 5632), (13312, 1), (13312, 1), (13312, 1), torch.float16),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the SwiGLU C API.
    lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
    lib.infiniopCreateSwiGLUDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopSwiGLUDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopSwiGLU.restype = c_int32
    lib.infiniopSwiGLU.argtypes = [
        infiniopSwiGLUDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
    lib.infiniopDestroySwiGLUDescriptor.argtypes = [
        infiniopSwiGLUDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # CONSISTENCY FIX: like the other operator test scripts, default to the
    # CPU backend when no device flag is given (previously this script ran
    # nothing yet still printed "Test passed!").
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
def get_args():
    """Parse the command-line flags shared by the operator test scripts.

    Returns the parsed argparse namespace. In addition to the declared
    flags, `cuda` and `bang` attributes are populated because the test
    scripts check `args.cuda` / `args.bang` while the user-facing flags are
    named `--nvidia` / `--cambricon`; without the aliases those scripts
    raise AttributeError.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Test Operator")
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Whether profile tests",
    )
    parser.add_argument(
        "--cpu",
        action="store_true",
        help="Run CPU test",
    )
    parser.add_argument(
        "--nvidia",
        action="store_true",
        help="Run NVIDIA GPU test",
    )
    parser.add_argument(
        "--cambricon",
        action="store_true",
        help="Run Cambricon MLU test",
    )
    parser.add_argument(
        "--ascend",
        action="store_true",
        help="Run ASCEND NPU test",
    )
    args = parser.parse_args()
    # BUG FIX: expose the attribute names the test scripts actually read.
    args.cuda = args.nvidia
    args.bang = args.cambricon
    return args
def synchronize_device(torch_device):
    """Block until all queued work on the named device has finished.

    Recognizes "cuda", "npu" and "mlu"; any other device string (e.g.
    "cpu") is a no-op.
    """
    import torch

    if torch_device == "cuda":
        torch.cuda.synchronize()
        return
    if torch_device == "npu":
        torch.npu.synchronize()
        return
    if torch_device == "mlu":
        torch.mlu.synchronize()
add_rules("mode.debug", "mode.release")

-- ANSI color codes used in build messages
local GREEN = '\27[0;32m'
local YELLOW = '\27[1;33m'
local NC = '\27[0m' -- No Color

add_includedirs("include")

-- Debug builds: no optimization, debug symbols, DEBUG_MODE define.
if is_mode("debug") then
    add_cxflags("-g -O0")
    add_defines("DEBUG_MODE")
end

-- CPU
option("cpu")
    set_default(true)
    set_showmenu(true)
    -- typo fix in user-facing text: "complie" -> "compile"
    set_description("Whether to compile implementations for CPU")
option_end()

option("omp")
    set_default(false)
    set_showmenu(true)
    set_description("Enable or disable OpenMP support for cpu kernel")
option_end()

if has_config("cpu") then
    includes("xmake/cpu.lua")
    add_defines("ENABLE_CPU_API")
end
-- NVIDIA
option("nv-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Nvidia GPU")
option_end()

if has_config("nv-gpu") then
    add_defines("ENABLE_CUDA_API")
    includes("xmake/cuda.lua")
end

-- Cambricon
option("cambricon-mlu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Cambricon MLU")
option_end()

if has_config("cambricon-mlu") then
    add_defines("ENABLE_CAMBRICON_API")
end

-- Huawei Ascend
option("ascend-npu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Huawei Ascend NPU")
option_end()

if has_config("ascend-npu") then
    add_defines("ENABLE_ASCEND_API")
end

-- MetaX
option("metax-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for MetaX GPU")
option_end()

if has_config("metax-gpu") then
    add_defines("ENABLE_MACA_API")
end

-- Moore Threads
option("moore-gpu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Moore Threads GPU")
option_end()

-- BUG FIX: the option is declared as "moore-gpu" but was checked via
-- has_config("mthreads-gpu"), so ENABLE_MUSA_API could never be defined.
if has_config("moore-gpu") then
    add_defines("ENABLE_MUSA_API")
end

-- Sugon
option("sugon-dcu")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for Sugon DCU")
option_end()

if has_config("sugon-dcu") then
    add_defines("ENABLE_CUDA_API")
    add_defines("ENABLE_SUGON_CUDA_API")
end
-- Aggregate shared library exposing every enabled backend behind one ABI.
target("infiniop")
    set_kind("shared")
    if has_config("cpu") then
        add_deps("infiniop-cpu")
    end
    if has_config("nv-gpu") then
        add_deps("infiniop-cuda")
    end
    if has_config("sugon-dcu") then
        -- Sugon DCU reuses the CUDA static objects but needs a custom
        -- linker toolchain and an explicit full-path link of the archive.
        local builddir = string.format(
            "build/%s/%s/%s",
            get_config("plat"),
            get_config("arch"),
            get_config("mode")
        )
        add_shflags("-s", "-shared", "-fPIC")
        add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl")
        -- Using -linfiniop-cuda will fail, manually link the target using full path
        add_deps("nv-gpu", {inherit = false})
        add_links(builddir.."/libinfiniop-cuda.a")
        set_toolchains("sugon-dcu-linker")
    end
    if has_config("cambricon-mlu") then
        add_deps("cambricon-mlu")
    end
    if has_config("ascend-npu") then
        add_deps("ascend-npu")
    end
    if has_config("metax-gpu") then
        add_deps("metax-gpu")
    end
    set_languages("cxx17")
    add_files("src/infiniop/devices/handle.cc")
    add_files("src/infiniop/ops/*/operator.cc")
    add_files("src/infiniop/*.cc")
    -- NOTE(review): YELLOW/NC are description-scope file locals; confirm they
    -- remain visible inside this script-scope closure under xmake's
    -- sandboxing — if they are nil here the concatenation will error.
    after_build(function (target) print(YELLOW .. "You can install the libraries with \"xmake install\"" .. NC) end)
    -- Install into $INFINI_ROOT, falling back to ~/.infini.
    set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
    add_installfiles("include/infiniop/(**/*.h)", {prefixdir = "include/infiniop"})
    add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"})
    add_installfiles("include/infiniop.h", {prefixdir = "include"})
    add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
-- Static library containing the CPU reference kernels.
target("infiniop-cpu")
    on_install(function (target) end) -- nothing extra to install for the static lib
    set_kind("static")
    if not is_plat("windows") then
        -- objects are later folded into the shared "infiniop" library
        add_cxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infiniop/devices/cpu/*.cc", "../src/infiniop/ops/*/cpu/*.cc")
    if has_config("omp") then
        add_cxflags("-fopenmp")
        add_ldflags("-fopenmp")
    end
target_end()
\ No newline at end of file
-- Locate CUDA and cuDNN installs from the conventional environment variables.
local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
if CUDA_ROOT ~= nil then
    add_includedirs(CUDA_ROOT .. "/include")
end
if CUDNN_ROOT ~= nil then
    add_includedirs(CUDNN_ROOT .. "/include")
end

-- Static library containing the CUDA kernels.
target("infiniop-cuda")
    set_kind("static")
    on_install(function (target) end) -- nothing extra to install for the static lib
    set_policy("build.cuda.devlink", true)
    set_toolchains("cuda")
    add_links("cublas")
    add_links("cudnn")
    add_cugencodes("native") -- target the GPU architectures present on the build machine
    if is_plat("windows") then
        add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
        if CUDNN_ROOT ~= nil then
            add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
        end
    else
        -- position-independent code so the objects can be linked into the
        -- shared "infiniop" library
        add_cuflags("-Xcompiler=-fPIC")
        add_culdflags("-Xcompiler=-fPIC")
        add_cxxflags("-fPIC")
    end
    set_languages("cxx17")
    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.