Unverified Commit 4e4d3415 authored by Catheriany's avatar Catheriany Committed by GitHub
Browse files

Merge branch 'main' into issue/150

parents d1c46889 1a4cfb99
...@@ -44,46 +44,46 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ ...@@ -44,46 +44,46 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
#define INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ACTION) \ #define INFINIRT_CALL_DEVICE_API_AND(DEVICE_TYPE, API, PARAMS, ACTION) \
{ \ { \
infiniStatus_t _status; \ infiniStatus_t _status; \
switch (CURRENT_DEVICE_TYPE) { \ switch (DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \ case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \ _status = infinirt::cpu::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_NVIDIA: \ case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \ _status = infinirt::cuda::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_CAMBRICON: \ case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \ _status = infinirt::bang::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_ASCEND: \ case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \ _status = infinirt::ascend::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_METAX: \ case INFINI_DEVICE_METAX: \
_status = infinirt::maca::API PARAMS; \ _status = infinirt::maca::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_MOORE: \ case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \ _status = infinirt::musa::API PARAMS; \
break; \ break; \
default: \ default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \ } \
{ ACTION; } \ { ACTION; } \
return _status; \ return _status; \
} }
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ) #define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(CURRENT_DEVICE_TYPE, API, PARAMS, )
__C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) { __C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
if (count == nullptr) { if (count == nullptr) {
return INFINI_STATUS_NULL_POINTER; return INFINI_STATUS_NULL_POINTER;
} }
INFINIRT_CALL_DEVICE_API(getDeviceCount, (count)); INFINIRT_CALL_DEVICE_API_AND(device, getDeviceCount, (count), {});
} }
__C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) { __C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) {
INFINIRT_CALL_DEVICE_API_AND(setDevice, (device_id), INFINIRT_CALL_DEVICE_API_AND(device, setDevice, (device_id),
{ CURRENT_DEVICE_TYPE = device; { CURRENT_DEVICE_TYPE = device;
CURRENT_DEVICE_ID = device_id; }); CURRENT_DEVICE_ID = device_id; });
} }
......
...@@ -138,4 +138,73 @@ void rearrange( ...@@ -138,4 +138,73 @@ void rearrange(
} }
} }
// Split the elementary copy unit of this layout into a smaller one so the
// rearrange kernel gains an extra innermost dimension to parallelize over.
//
// `candidates` is scanned in order and the first value that evenly divides
// the current unit is chosen. Returns INFINI_STATUS_BAD_PARAM when no
// candidate divides the unit, and a copy of this meta when the chosen unit
// equals the current one.
utils::Result<RearrangeMeta> RearrangeMeta::distributeUnit(const std::vector<size_t> &candidates) const {
    // Current unit size (first entry of the packed meta array).
    size_t current_unit = _meta[0];
    // Pick the first candidate that divides the current unit evenly.
    size_t new_unit = 0;
    for (size_t candidate : candidates) {
        if (current_unit % candidate == 0) {
            new_unit = candidate;
            break;
        }
    }
    // No suitable divisor found: the unit cannot be distributed.
    if (new_unit == 0) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Unit unchanged: hand back a copy of the existing meta.
    if (new_unit == current_unit) {
        return Result<RearrangeMeta>(_meta);
    }
    size_t ndim_value = this->ndim();
    // New packed layout, one extra dimension appended everywhere:
    // [unit][ndim+2 idx strides][ndim+1 dst strides][ndim+1 src strides].
    std::vector<ptrdiff_t> layout(2 + (ndim_value + 1) * 3, 0);
    // Install the smaller unit.
    layout[0] = new_unit;
    // Each old unit now spans `extra` new units.
    ptrdiff_t extra = current_unit / new_unit;
    // Offset of the idx-stride section inside _meta.
    ptrdiff_t idx_offset = 1;
    // Section pointers into the new packed layout.
    ptrdiff_t *new_idx = layout.data() + 1;
    ptrdiff_t *new_dst = layout.data() + 2 + (ndim_value + 1);
    ptrdiff_t *new_src = layout.data() + 2 + (ndim_value + 1) * 2;
    // Old idx strides are scaled by `extra`, since indices now count the
    // smaller unit.
    for (size_t i = 0; i < ndim_value + 1; ++i) {
        new_idx[i] = _meta[idx_offset + i] * extra;
    }
    // The appended innermost dimension advances one new unit per step.
    new_idx[ndim_value + 1] = 1;
    // dst strides are copied verbatim; the appended dimension's stride is
    // the new unit size (strides presumably in bytes — TODO confirm).
    for (size_t i = 0; i < ndim_value; ++i) {
        new_dst[i] = dst_strides()[i];
    }
    new_dst[ndim_value] = new_unit;
    // Same treatment for the src strides.
    for (size_t i = 0; i < ndim_value; ++i) {
        new_src[i] = src_strides()[i];
    }
    new_src[ndim_value] = new_unit;
    return Result<RearrangeMeta>(layout);
}
} // namespace utils } // namespace utils
...@@ -28,6 +28,9 @@ public: ...@@ -28,6 +28,9 @@ public:
const ptrdiff_t *src_strides() const; const ptrdiff_t *src_strides() const;
void launch(void *dst, const void *src) const; void launch(void *dst, const void *src) const;
// Split the unit into smaller pieces to improve parallelism
utils::Result<RearrangeMeta> distributeUnit(const std::vector<size_t> &candidates) const;
}; };
void rearrange( void rearrange(
......
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def mul(
    a: np.ndarray,
    b: np.ndarray
):
    """Reference element-wise product used as the expected answer."""
    return a * b
def random_tensor(shape, dtype):
    """Uniform random tensor with tiny values centered at zero.

    Values lie in [-5e-4, 5e-4], which keeps element-wise products well
    inside the representable range of low-precision float types.
    """
    scale = 1e-3
    half = scale * 0.5
    return scale * np.random.rand(*shape).astype(dtype) - half
class MulTestCase(InfiniopTestCase):
    """One mul test case to be serialized into the gguf test file.

    Holds the two inputs, the (pre-filled) output buffer, and their
    optional strides; `write_test` emits the tensors plus reference
    answers in the working dtype and in float64.
    """

    def __init__(
        self,
        a: np.ndarray,
        stride_a: List[int] | None,
        b: np.ndarray,
        stride_b: List[int] | None,
        c: np.ndarray,
        stride_c: List[int] | None,
    ):
        super().__init__("mul")
        self.a = a
        self.stride_a = stride_a
        self.b = b
        self.stride_b = stride_b
        self.c = c
        self.stride_c = stride_c

    def write_test(self, test_writer: "InfiniopTestWriter"):
        super().write_test(test_writer)
        # Strides are optional metadata: omit the key when unset so the
        # reader presumably falls back to a contiguous layout — TODO confirm.
        if self.stride_a is not None:
            test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
        if self.stride_b is not None:
            test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
        if self.stride_c is not None:
            test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
        test_writer.add_tensor(
            test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
        )
        # Two reference answers: one computed in float64 and one in the
        # working dtype, so the runner can pick a tolerance-appropriate
        # baseline.
        a_fp64 = self.a.astype(np.float64)
        b_fp64 = self.b.astype(np.float64)
        ans_fp64 = np.multiply(a_fp64, b_fp64)
        ans = mul(self.a, self.b)
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans_fp64"),
            ans_fp64,
            raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
        )
if __name__ == '__main__':
    test_writer = InfiniopTestWriter("mul.gguf")
    # Cases cover contiguous, transposed and default (None) stride layouts
    # in f16/f32/f64 at several sizes, including mismatched a/b/c strides.
    test_cases = [
        MulTestCase(
            random_tensor((2, 3), np.float32),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float32),
            gguf_strides(1, 2),
            random_tensor((2, 3), np.float32),
            gguf_strides(3, 1),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float16),
            gguf_strides(1, 2),
            random_tensor((2, 3), np.float16),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float16),
            gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((2, 3), np.float64),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float64),
            gguf_strides(3, 1),
            random_tensor((2, 3), np.float64),
            gguf_strides(1, 2),
        ),
        MulTestCase(
            random_tensor((4, 6), np.float16),
            gguf_strides(1, 4),
            random_tensor((4, 6), np.float16),
            gguf_strides(1, 5),
            random_tensor((4, 6), np.float16),
            gguf_strides(6, 1),
        ),
        MulTestCase(
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 1),
            random_tensor((1, 2048), np.float16),
            gguf_strides(2048, 1),
            random_tensor((1, 2048), np.float16),
            gguf_strides(1, 1),
        ),
        MulTestCase(
            random_tensor((2048, 2048), np.float32),
            None,
            random_tensor((2048, 2048), np.float32),
            gguf_strides(1, 2048),
            random_tensor((2048, 2048), np.float32),
            None,
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(4 * 2048, 2048, 1),
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float16),
            gguf_strides(4 * 2048, 2048, 1),
        ),
        MulTestCase(
            random_tensor((2, 4, 2048), np.float32),
            gguf_strides(1, 2, 2 * 4),
            random_tensor((2, 4, 2048), np.float32),
            None,
            random_tensor((2, 4, 2048), np.float32),
            gguf_strides(1, 2, 2 * 4),
        ),
        MulTestCase(
            random_tensor((2048, 2560), np.float32),
            gguf_strides(2560, 1),
            random_tensor((2048, 2560), np.float32),
            gguf_strides(1, 2048),
            random_tensor((2048, 2560), np.float32),
            gguf_strides(2560, 1),
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float16),
            gguf_strides(64 * 48, 64, 1),
            random_tensor((4, 48, 64), np.float16),
            gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float16),
            None
        ),
        MulTestCase(
            random_tensor((4, 48, 64), np.float32),
            None,
            random_tensor((4, 48, 64), np.float32),
            gguf_strides(1, 4, 4 * 48),
            random_tensor((4, 48, 64), np.float32),
            gguf_strides(48 * 64, 64, 1),
        )
    ]
    test_writer.add_tests(test_cases)
    test_writer.save()
from ctypes import POINTER, Structure, c_int32, c_void_p import torch
import ctypes import ctypes
import sys from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import os from libinfiniop import (
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t, infiniopHandle_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
create_handle, open_lib,
destroy_handle, to_tensor,
get_test_devices,
check_error, check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
) )
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto from enum import Enum, auto
import torch
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum): class Inplace(Enum):
...@@ -26,6 +43,35 @@ class Inplace(Enum): ...@@ -26,6 +43,35 @@ class Inplace(Enum):
INPLACE_B = auto() INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure): class AddDescriptor(Structure):
_fields_ = [("device", c_int32)] _fields_ = [("device", c_int32)]
...@@ -37,42 +83,71 @@ def add(x, y): ...@@ -37,42 +83,71 @@ def add(x, y):
return torch.add(x, y) return torch.add(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
rearrange the tensors if needed and apply the inplace config.
if inplace is true and the output (i.e., c) is placed to the broadcasted input,
the inplace config is ignored and out-of-place is used
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test( def test(
lib, lib,
handle, handle,
torch_device, torch_device,
c_shape, shape,
a_shape, a_stride=None,
b_shape, b_stride=None,
tensor_dtype=torch.float16, c_stride=None,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
): ):
print( print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
) )
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) b = torch.rand(shape, dtype=dtype).to(torch_device)
c = ( c = torch.rand(shape, dtype=dtype).to(torch_device)
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
ans = add(a, b) ans = add(a, b)
a_tensor = to_tensor(a, lib) a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
b_tensor = to_tensor(b, lib)
c_tensor = ( c_tensor = (
to_tensor(c, lib) to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
) )
descriptor = infiniopAddDescriptor_t() if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
check_error( check_error(
lib.infiniopCreateAddDescriptor( lib.infiniopCreateAddDescriptor(
handle, handle,
...@@ -84,74 +159,48 @@ def test( ...@@ -84,74 +159,48 @@ def test(
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate() for tensor in [a_tensor, b_tensor, c_tensor]:
a_tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
b_tensor.descriptor.contents.invalidate()
workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None) lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
) )
assert torch.allclose(c, ans, atol=0, rtol=1e-3) workspace = create_workspace(workspace_size.value, c.device)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def lib_add():
check_error(
def test_cpu(lib, test_cases): lib.infiniopAdd(
device = DeviceEnum.DEVICE_CPU descriptor,
handle = create_handle(lib, device) workspace.data_ptr() if workspace is not None else None,
for c_shape, a_shape, b_shape, inplace in test_cases: workspace_size.value,
# fmt: off c_tensor.data,
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) a_tensor.data,
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) b_tensor.data,
# fmt: on None,
destroy_handle(lib, handle) )
)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
lib_add()
def test_bang(lib, test_cases): atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
import torch_mlu if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
device = DeviceEnum.DEVICE_BANG # Profiling workflow
handle = create_handle(lib, device) if PROFILE:
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off # fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) profile_operation("PyTorch", lambda: add(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
destroy_handle(lib, handle) check_error(lib.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
]
args = get_args() args = get_args()
lib = open_lib() lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32 lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [ lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t, infiniopHandle_t,
...@@ -160,25 +209,36 @@ if __name__ == "__main__": ...@@ -160,25 +209,36 @@ if __name__ == "__main__":
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
] ]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32 lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [ lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t, infiniopAddDescriptor_t,
c_void_p, c_void_p,
c_uint64,
c_void_p,
c_void_p, c_void_p,
c_void_p, c_void_p,
c_void_p, c_void_p,
] ]
lib.infiniopDestroyAddDescriptor.restype = c_int32 lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [ lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t, infiniopAddDescriptor_t,
] ]
if args.cpu: # Configure testing options
test_cpu(lib, test_cases) DEBUG = args.debug
if args.cuda: PROFILE = args.profile
test_cuda(lib, test_cases) NUM_PRERUN = args.num_prerun
if args.bang: NUM_ITERATIONS = args.num_iterations
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang): for device in get_test_devices(args):
test_cpu(lib, test_cases) test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
    # shape, a_stride, b_stride, c_stride
    # None means default contiguous layout; a 0 stride marks a broadcast dim.
    ((13, 4), None, None, None),
    ((13, 4), (10, 1), (10, 1), (10, 1)),
    ((13, 4), (0, 1), None, None),
    ((13, 4, 4), None, None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
    ((16, 5632), None, None, None),
    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
    """Which operand, if any, the output tensor should alias."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
    Inplace.OUT_OF_PLACE,
    Inplace.INPLACE_A,
    Inplace.INPLACE_B,
]

# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
    test_case + (inplace_item,)
    for test_case in _TEST_CASES_
    for inplace_item in _INPLACE
]

# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]

# Tolerance map for different data types
_TOLERANCE_MAP = {
    torch.float16: {"atol": 1e-3, "rtol": 1e-3},
    torch.float32: {"atol": 1e-7, "rtol": 1e-7},
}

# Runtime options; defaults are overridden from CLI args in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
    """ctypes mirror of the opaque C-side mul descriptor handle."""
    _fields_ = [("device", c_int32)]


# Pointer type passed across the infiniop C API.
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
    """Element-wise product computed with PyTorch as the reference."""
    return x * y
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
    """
    Rearrange the tensors if needed and apply the inplace config.
    If inplace is requested and the output (i.e., c) would alias a
    broadcasted input, the broadcasted strides are replaced with the
    original unbroadcasted c strides.
    """
    # Remember the unbroadcasted output strides before any rearranging.
    fallback_strides = c_strides if c_strides else c.stride()

    def _apply_strides(tensor, strides):
        # A zero stride marks a broadcast view; install it directly via
        # set_, since rearrange_if_needed cannot build overlapping views.
        if strides and 0 in strides:
            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
            return tensor
        return rearrange_if_needed(tensor, strides)

    a = _apply_strides(a, a_stride)
    b = _apply_strides(b, b_stride)
    c = _apply_strides(c, c_strides)

    # Alias the output onto the requested input operand.
    if inplace != Inplace.OUT_OF_PLACE:
        c = a if inplace == Inplace.INPLACE_A else b

    # A broadcast view cannot be written in place; restore the original
    # unbroadcasted strides on the output.
    if 0 in c.stride():
        c.set_(c.untyped_storage(), 0, c.shape, fallback_strides)
    return a, b, c
def test(
    lib,
    handle,
    torch_device,
    shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=torch.float16,
    sync=None,
):
    """Run one mul test case against the infiniop library.

    Builds random inputs with the requested strides/inplace config,
    computes the PyTorch reference, invokes infiniopMul, and asserts the
    results match within the dtype-specific tolerance. Optionally
    profiles both implementations when PROFILE is set.
    """
    print(
        f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
        f"dtype:{dtype} inplace:{inplace}"
    )

    a = torch.rand(shape, dtype=dtype).to(torch_device)
    b = torch.rand(shape, dtype=dtype).to(torch_device)
    c = torch.rand(shape, dtype=dtype).to(torch_device)
    # Apply strides and the inplace aliasing before computing the reference.
    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
    ans = mul(a, b)

    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
    # The output lib-tensor must alias the same lib-tensor as the torch
    # tensor it points at, mirroring the inplace config.
    c_tensor = (
        to_tensor(c, lib)
        if inplace == Inplace.OUT_OF_PLACE
        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    )
    # Device synchronization hook supplied by the test harness, if any.
    if sync is not None:
        sync()

    descriptor = infiniopMulDescriptor_t()
    check_error(
        lib.infiniopCreateMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )

    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [a_tensor, b_tensor, c_tensor]:
        tensor.destroyDesc(lib)

    # Query and allocate the workspace the kernel needs.
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, c.device)

    def lib_mul():
        # Invoke the library kernel; the trailing None is a stream/void* slot.
        check_error(
            lib.infiniopMul(
                descriptor,
                workspace.data_ptr() if workspace is not None else None,
                workspace_size.value,
                c_tensor.data,
                a_tensor.data,
                b_tensor.data,
                None,
            )
        )

    lib_mul()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(c, ans, atol=atol, rtol=rtol)
    assert torch.allclose(c, ans, atol=atol, rtol=rtol)

    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
...@@ -17,19 +17,88 @@ from libinfiniop import ( ...@@ -17,19 +17,88 @@ from libinfiniop import (
profile_operation, profile_operation,
) )
def row_major_strides(shape):
    """Row-major (C-order) strides for a contiguous tensor.

    Args:
        shape: tensor shape (sequence of dimension extents)
    Returns:
        list of strides, innermost dimension having stride 1
    """
    # Build from the innermost dimension outward: each new stride is the
    # previous (inner) stride times that dimension's extent.
    strides = [1]
    for extent in reversed(shape[1:]):
        strides.insert(0, strides[0] * extent)
    return strides
def column_major_strides(shape):
    """Column-major (Fortran-order) strides for a contiguous tensor.

    Args:
        shape: tensor shape (sequence of dimension extents)
    Returns:
        list of strides, first dimension having stride 1
    """
    # Build from the first dimension onward: each new stride is the
    # previous stride times the preceding dimension's extent.
    strides = [1]
    for extent in shape[:-1]:
        strides.append(strides[-1] * extent)
    return strides
# ============================================================================== # ==============================================================================
# Configuration (Internal Use Only) # Configuration (Internal Use Only)
# ============================================================================== # ==============================================================================
# These are not meant to be imported from other modules # These are not meant to be imported from other modules
_TEST_CASES = [ _TEST_CASES = [
# ((src_shape, src_stride), (dst_shape, dst_stride)) # (shape, x_stride, y_stride)
(((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))), (
(((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)), (2, 4, 64), # shape
(((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))), (2, 4, 8), # x_stride
(((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))), (512, 128, 2) # y_stride
(((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))), ),
(((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))), (
(((64,), (1,)), ((64,), (1,))), (100, 100), # shape
(1, 100), # x_stride
(100, 1) # y_stride
),
(
(4, 4), # shape
(1, 4), # x_stride
(4, 1) # y_stride
),
(
(4, 6, 64), # shape
(64, 4*64, 1), # x_stride
(6*64, 64, 1) # y_stride
),
(
(2000, 2000), # shape
(1, 2000), # x_stride
(2000, 1) # y_stride
),
(
(2001, 2001), # shape
(1, 2001), # x_stride
(2001, 1) # y_stride
),
(
(3, 4, 7, 53, 9), # shape
row_major_strides((3, 4, 7, 53, 9)), # x_stride
column_major_strides((3, 4, 7, 53, 9)) # y_stride
),
(
(3, 4, 50, 50, 5, 7), # shape
row_major_strides((3, 4, 50, 50, 5, 7)), # x_stride
column_major_strides((3, 4, 50, 50, 5, 7)) # y_stride
),
] ]
# Data types used for testing # Data types used for testing
...@@ -58,23 +127,23 @@ def test( ...@@ -58,23 +127,23 @@ def test(
lib, lib,
handle, handle,
torch_device, torch_device,
x_shape, shape,
x_stride, x_stride,
y_shape,
y_stride, y_stride,
dtype=torch.float16, dtype=torch.float16,
): ):
print( print(
f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} dtype:{dtype}" f"Testing Rerrange on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype}"
) )
x = torch.rand(x_shape, dtype=dtype).to(torch_device) x = torch.rand(shape, dtype=dtype).to(torch_device)
y = torch.zeros(y_shape, dtype=dtype).to(torch_device) y = torch.zeros(shape, dtype=dtype).to(torch_device)
x, y = [ x, y = [
rearrange_if_needed(tensor, stride) rearrange_if_needed(tensor, stride)
for tensor, stride in zip([x, y], [x_stride, y_stride]) for tensor, stride in zip([x, y], [x_stride, y_stride])
] ]
x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]] x_tensor, y_tensor = [to_tensor(tensor, lib) for tensor in [x, y]]
descriptor = infiniopRearrangeDescriptor_t() descriptor = infiniopRearrangeDescriptor_t()
...@@ -86,7 +155,7 @@ def test( ...@@ -86,7 +155,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [x_tensor, y_tensor]: for tensor in [x_tensor, y_tensor]:
tensor.descriptor.contents.invalidate() tensor.destroyDesc(lib)
def lib_rearrange(): def lib_rearrange():
check_error( check_error(
......
...@@ -61,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32] ...@@ -61,7 +61,8 @@ _TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2}, torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 2e-7, "rtol": 1e-7},
} }
DEBUG = False DEBUG = False
......
...@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then ...@@ -118,6 +118,18 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua") includes("xmake/kunlun.lua")
end end
-- InfiniCCL build option: opt-in compilation of the collective
-- communication implementations.
option("ccl")
    set_default(false)
    set_showmenu(true)
    set_description("Whether to compile implementations for InfiniCCL")
option_end()

if has_config("ccl") then
    add_defines("ENABLE_CCL")
end
target("infini-utils") target("infini-utils")
set_kind("static") set_kind("static")
on_install(function (target) end) on_install(function (target) end)
...@@ -220,10 +232,25 @@ target("infiniop") ...@@ -220,10 +232,25 @@ target("infiniop")
add_installfiles("include/infinicore.h", {prefixdir = "include"}) add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end() target_end()
target("infiniccl")
set_kind("shared")
add_deps("infinirt")
if has_config("nv-gpu") then
add_deps("infiniccl-cuda")
end
set_languages("cxx17")
add_files("src/infiniccl/*.cc")
add_installfiles("include/infiniccl.h", {prefixdir = "include"})
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
target("all") target("all")
set_kind("phony") set_kind("phony")
add_deps("infiniop", "infinirt") add_deps("infiniop", "infinirt", "infiniccl")
after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end) after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end() target_end()
......
...@@ -58,3 +58,34 @@ target("infinirt-cuda") ...@@ -58,3 +58,34 @@ target("infinirt-cuda")
set_languages("cxx17") set_languages("cxx17")
add_files("../src/infinirt/cuda/*.cu") add_files("../src/infinirt/cuda/*.cu")
target_end() target_end()
target("infiniccl-cuda")
set_kind("static")
add_deps("infinirt")
on_install(function (target) end)
if has_config("ccl") then
set_policy("build.cuda.devlink", true)
set_toolchains("cuda")
add_links("cudart")
if not is_plat("windows") then
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxflags("-fPIC")
local nccl_root = os.getenv("NCCL_ROOT")
if nccl_root then
add_includedirs(nccl_root .. "/include")
add_links(nccl_root .. "/lib/libnccl.so")
else
add_links("nccl") -- Fall back to default nccl linking
end
add_files("../src/infiniccl/cuda/*.cu")
else
print("[Warning] NCCL is not supported on Windows")
end
end
set_languages("cxx17")
target_end()
...@@ -34,3 +34,20 @@ target("infiniop-test") ...@@ -34,3 +34,20 @@ target("infiniop-test")
set_installdir(INFINI_ROOT) set_installdir(INFINI_ROOT)
target_end() target_end()
target("infiniccl-test")
set_kind("binary")
add_deps("infini-utils")
set_default(false)
set_warnings("all", "error")
set_languages("cxx17")
local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
add_includedirs(INFINI_ROOT.."/include")
add_linkdirs(INFINI_ROOT.."/lib")
add_links("infinirt", "infiniccl")
add_files(os.projectdir().."/src/infiniccl-test/*.cpp")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment