# Commit 46da1a27 authored by PanZezhongQY
# feat: cpu and cuda matmul
import libinfiniop
# (no newline at end of original file)
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
class Inplace(Enum):
    """Selects which tensor, if any, the Add output aliases."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()  # result is written into operand a
    INPLACE_B = auto()  # result is written into operand b
class AddDescriptor(Structure):
    """ctypes mirror of the opaque C-side Add descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(x, y):
    """PyTorch reference for elementwise addition (broadcasts like torch.add)."""
    return x + y
def test(
    lib,
    handle,
    torch_device,
    c_shape,
    a_shape,
    b_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
):
    """Check the library's Add operator against the PyTorch reference.

    Builds random a/b (and c for the out-of-place case), creates an Add
    descriptor through the C library, runs it, and asserts the result
    matches add(a, b) within rtol=1e-3.
    """
    print(
        f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
    )
    # In-place variants write into one of the operands, which only makes
    # sense when no broadcasting is involved.
    if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
        print("Unsupported test: broadcasting does not support in-place")
        return
    a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
    # For in-place runs, c aliases the operand that receives the result.
    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
    ans = add(a, b)
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
    descriptor = infiniopAddDescriptor_t()
    check_error(
        lib.infiniopCreateAddDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    c_tensor.descriptor.contents.invalidate()
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
    )
    assert torch.allclose(c, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Add test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Add test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Add test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for c_shape, a_shape, b_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # c_shape, a_shape, b_shape, inplace
        # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
        # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
        ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
        ((), (), (), Inplace.OUT_OF_PLACE),
        ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
        ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
        ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
        ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
        ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
        ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
        ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
        ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAddDescriptor.restype = c_int32
    lib.infiniopCreateAddDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAddDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopAdd.restype = c_int32
    lib.infiniopAdd.argtypes = [
        infiniopAddDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAddDescriptor.restype = c_int32
    lib.infiniopDestroyAddDescriptor.argtypes = [
        infiniopAddDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
    """ctypes mirror of the opaque C-side Attention descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
    """Reference causal softmax over the last dimension.

    Positions above the flipped strict-lower-triangular band are masked to
    -inf so each row only attends to its causal prefix. The softmax runs in
    float32 for accuracy and the result is cast back to x's dtype.
    """
    # `dtype` instead of `type`: don't shadow the builtin.
    dtype = x.dtype
    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
    # torch.where never mutates its inputs, so no defensive clone is needed.
    masked = torch.where(mask == 1, -torch.inf, x.to(torch.float32))
    return torch.nn.functional.softmax(masked, dim=-1).to(dtype)
def attention(q, k, v, k_cache, v_cache, pos):
    """PyTorch reference attention with a KV cache and grouped-query heads.

    q: (n_q_head, seq_len, head_dim); k/v: (n_kv_head, seq_len, head_dim);
    the first `pos` cached positions of k_cache/v_cache are prepended to k/v.
    Returns (seq_len, n_q_head, head_dim).
    """
    type = q.dtype  # NOTE(review): shadows the builtin `type`; kept as-is
    n_q_head = q.shape[0]
    n_kv_head = k.shape[0]
    # Concatenate key and value caches
    k_cache = k_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
    v_cache = v_cache[:, :pos, :]  # (n_kv_head, pos, head_dim)
    k = torch.cat([k_cache, k], dim=1)  # (n_kv_head, total_seq_len, head_dim)
    v = torch.cat([v_cache, v], dim=1)  # (n_kv_head, total_seq_len, head_dim)
    total_seq_len = k.shape[1]
    head_dim = v.shape[-1]
    # Grouped-query attention: fold each kv head's query group into the
    # sequence axis so one batched einsum serves all groups.
    if n_q_head != n_kv_head:
        q = q.reshape(
            n_kv_head, -1, head_dim
        )  # (n_kv_head, n_group * seq_len, head_dim)
    # Scaled dot-product attention (scores computed in float32)
    attn_scores = (
        torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32))
        .to(type)
        .reshape(n_q_head, -1, total_seq_len)
    )  # (n_q_head, seq_len, total_seq_len)
    attn_scores = attn_scores / (head_dim**0.5)
    attn_weights = causal_softmax(attn_scores).reshape(
        n_kv_head, -1, total_seq_len
    )  # (n_kv_head, seq_len, total_seq_len)
    # Weighted sum of values
    attn_output = (
        torch.einsum(
            "hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32)
        )
        .to(type)
        .reshape(n_q_head, -1, head_dim)
        .permute(1, 0, 2)
    )  # ([seq_len, n_q_head, head_dim])
    return attn_output
def test(
    lib,
    handle,
    torch_device,
    n_q_head,
    n_kv_head,
    seq_len,
    head_dim,
    pos,
    k_cache_buf_len,
    v_cache_buf_len,
    dtype=torch.float16,
    q_stride=None,
    k_stride=None,
    v_stride=None,
    k_cache_stride=None,
    v_cache_stride=None,
):
    """Check the library's Attention operator against attention().

    Optionally re-lays-out q/k/v and the caches with explicit strides before
    handing them to the C library; the output must match within atol=1e-4,
    rtol=1e-2.
    """
    print(
        f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
        f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
    )
    out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
    # Inputs scaled by 0.1 to keep float16 accumulation numerically tame.
    q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
    k_cache = (
        torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
        * 0.1
    )
    v_cache = (
        torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
        * 0.1
    )
    # Reference result is computed before any re-striding.
    ans = attention(q, k, v, k_cache, v_cache, pos)
    if q_stride is not None:
        q = rearrange_tensor(q, q_stride)
    if k_stride is not None:
        k = rearrange_tensor(k, k_stride)
    if v_stride is not None:
        v = rearrange_tensor(v, v_stride)
    if k_cache_stride is not None:
        k_cache = rearrange_tensor(k_cache, k_cache_stride)
    if v_cache_stride is not None:
        v_cache = rearrange_tensor(v_cache, v_cache_stride)
    out_tensor = to_tensor(out, lib)
    q_tensor = to_tensor(q, lib)
    k_tensor = to_tensor(k, lib)
    v_tensor = to_tensor(v, lib)
    k_cache_tensor = to_tensor(k_cache, lib)
    v_cache_tensor = to_tensor(v_cache, lib)
    descriptor = infiniopAttentionDescriptor_t()
    check_error(
        lib.infiniopCreateAttentionDescriptor(
            handle,
            ctypes.byref(descriptor),
            out_tensor.descriptor,
            q_tensor.descriptor,
            k_tensor.descriptor,
            v_tensor.descriptor,
            k_cache_tensor.descriptor,
            v_cache_tensor.descriptor,
            pos,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    out_tensor.descriptor.contents.invalidate()
    q_tensor.descriptor.contents.invalidate()
    k_tensor.descriptor.contents.invalidate()
    v_tensor.descriptor.contents.invalidate()
    k_cache_tensor.descriptor.contents.invalidate()
    v_cache_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, out.device)
    check_error(
        lib.infiniopAttention(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            out_tensor.data,
            q_tensor.data,
            k_tensor.data,
            v_tensor.data,
            k_cache_tensor.data,
            v_cache_tensor.data,
            None,
        )
    )
    assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
    check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Attention test case on CPU.

    Each case tuple is positional and matches test()'s parameter order
    after `torch_device`, so it can be splatted straight through.
    """
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Attention test case on CUDA.

    Case tuples are positional and match test()'s parameter order after
    `torch_device`, so they are splatted straight through.
    """
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Attention test case on a Cambricon MLU.

    Case tuples are positional and match test()'s parameter order after
    `torch_device`, so they are splatted straight through.
    """
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Each tuple matches test()'s positional parameters after torch_device.
    test_cases = [
        # prefill
        (
            32,  # n_q_head
            4,  # n_kv_head
            5,  # seq_len
            64,  # head_dim
            0,  # pos
            2048,  # k_cache_buf_len
            2048,  # v_cache_buf_len
            torch.float16,  # dtype
            [64, 2560, 1],  # q_stride
            [64, 2560, 1],  # k_stride
            [64, 2560, 1],  # v_stride
            [64, 11264, 1],  # k_cache_stride
            [64, 11264, 1],  # v_cache_stride
        ),
        # decode
        (
            32,  # n_q_head
            4,  # n_kv_head
            1,  # seq_len
            64,  # head_dim
            3,  # pos
            2048,  # k_cache_buf_len
            2048,  # v_cache_buf_len
            torch.float16,  # dtype
            [64, 2560, 1],  # q_stride
            [64, 2560, 1],  # k_stride
            [64, 2560, 1],  # v_stride
            [64, 11264, 1],  # k_cache_stride
            [64, 11264, 1],  # v_cache_stride
        ),
        # for test
        (
            8,  # n_q_head
            4,  # n_kv_head
            2,  # seq_len
            16,  # head_dim
            1,  # pos
            8,  # k_cache_buf_len
            8,  # v_cache_buf_len
            torch.float16,  # dtype
            None,  # q_stride
            None,  # k_stride
            None,  # v_stride
            None,  # k_cache_stride
            None,  # v_cache_stride
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAttentionDescriptor.restype = c_int32
    lib.infiniopCreateAttentionDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAttentionDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_uint64,
    ]
    lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
    lib.infiniopGetAttentionWorkspaceSize.argtypes = [
        infiniopAttentionDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAttention.restype = c_int32
    lib.infiniopAttention.argtypes = [
        infiniopAttentionDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAttentionDescriptor.restype = c_int32
    lib.infiniopDestroyAttentionDescriptor.argtypes = [
        infiniopAttentionDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class AvgPoolDescriptor(Structure):
    """ctypes mirror of the opaque C-side AvgPool descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
    """PyTorch reference average pooling for 1-D/2-D/3-D inputs.

    x must be (N, C, *spatial). Returns None for unsupported ranks.
    The `dilation` parameter is accepted for interface parity but unused
    (torch's AvgPool layers take no dilation).
    """
    layer_by_ndim = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_ndim = len(x.shape) - 2
    layer_cls = layer_by_ndim.get(spatial_ndim)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    layer = layer_cls(k, stride=stride, padding=padding)
    # 3-D fp16 pooling is not supported natively; round-trip through fp32.
    if spatial_ndim == 3 and x.dtype == torch.float16:
        ans = layer(x.to(torch.float32)).to(torch.float16)
    else:
        ans = layer(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Infer the output shape of an N-D average pool.

    x_shape is (N, C, *spatial); the leading two dims pass through and each
    spatial dim becomes (dim + 2*pad - kernel) // stride + 1.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    spatial = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy a Python tuple into a C int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the memory stays
    alive as long as the returned pointer does.
    """
    c_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(c_array, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
):
    """Check the library's AvgPool operator against the PyTorch reference.

    When PROFILE is set, also reports averaged wall-clock times for both
    the PyTorch and the library implementations.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is allocated as a raw uint8 tensor on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every AvgPool test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every AvgPool test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every AvgPool test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
        infiniopAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopAvgPool.restype = c_int32
    lib.infiniopAvgPool.argtypes = [
        infiniopAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
        infiniopAvgPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
class CausalSoftmaxDescriptor(Structure):
    """ctypes mirror of the opaque C-side CausalSoftmax descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
    """Reference causal softmax over the last dimension.

    Positions above the flipped strict-lower-triangular band are masked to
    -inf so each row only attends to its causal prefix. The softmax runs in
    float32 for accuracy and the result is cast back to x's dtype.
    """
    # `dtype` instead of `type`: don't shadow the builtin.
    dtype = x.dtype
    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
    # torch.where never mutates its inputs, so no defensive clone is needed.
    masked = torch.where(mask == 1, -torch.inf, x.to(torch.float32))
    return torch.nn.functional.softmax(masked, dim=-1).to(dtype)
def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16):
    """Check the library's in-place CausalSoftmax against causal_softmax().

    x is overwritten by the library call and must match the reference
    within rtol=1e-2.
    """
    print(
        f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}"
    )
    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    # Reference result computed before the library mutates x.
    ans = causal_softmax(x)
    x_tensor = to_tensor(x, lib)
    descriptor = infiniopCausalSoftmaxDescriptor_t()
    check_error(
        lib.infiniopCreateCausalSoftmaxDescriptor(
            handle, ctypes.byref(descriptor), x_tensor.descriptor
        )
    )
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetCausalSoftmaxWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    workspace = create_workspace(workspace_size.value, x.device)
    check_error(
        lib.infiniopCausalSoftmax(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            x_tensor.data,
            None,
        )
    )
    # The operator runs in place: x now holds the softmax result.
    assert torch.allclose(x, ans, atol=0, rtol=1e-2)
    check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every CausalSoftmax test case on CPU."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape, stride in test_cases:
        test(lib, handle, "cpu", shape, stride)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every CausalSoftmax test case on CUDA."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape, stride in test_cases:
        test(lib, handle, "cuda", shape, stride)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every CausalSoftmax test case on a Cambricon MLU."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape, stride in test_cases:
        test(lib, handle, "mlu", shape, stride)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every CausalSoftmax test case on an Ascend NPU."""
    import torch_npu

    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for shape, stride in test_cases:
        test(lib, handle, "npu", shape, stride)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, x_stride
        ((32, 20, 512), None),
        ((32, 20, 512), (20480, 512, 1)),  # Ascend does not yet support non-contiguous layouts
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopCausalSoftmaxDescriptor_t),
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
    lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopCausalSoftmax.restype = c_int32
    lib.infiniopCausalSoftmax.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
    lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
        infiniopCausalSoftmaxDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
import math
import ctypes
from torch.nn import functional as F
from typing import List, Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class ConvDescriptor(Structure):
    """ctypes mirror of the opaque C-side Conv descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
    """PyTorch reference convolution for 1-D/2-D/3-D inputs.

    x must be (N, C, *spatial); dispatches to F.conv1d/2d/3d on the number
    of spatial dims. Returns None for unsupported ranks.
    """
    conv_by_ndim = {1: F.conv1d, 2: F.conv2d, 3: F.conv3d}
    conv_fn = conv_by_ndim.get(len(x.shape) - 2)
    if conv_fn is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    return conv_fn(x, w, stride=stride, padding=padding, dilation=dilation)
# infer the shape of the output given the inputs for a N-ary convolution
def inferShape(
    x_shape: List[int],
    w_shape: List[int],
    pads: List[int],
    strides: List[int],
    dilations: List[int],
) -> Tuple[int, ...]:
    """Infer the output shape of an N-D convolution.

    Output is (batch, out_channels, *spatial) where each spatial dim is
    floor((in + 2*pad - dilation*(kernel-1) - 1) / stride + 1).
    """
    assert (
        len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
    ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
    spatial = []
    for i, (p, s, d) in enumerate(zip(pads, strides, dilations)):
        effective_kernel = d * (w_shape[i + 2] - 1) + 1
        spatial.append(math.floor((x_shape[i + 2] + 2 * p - effective_kernel) / s + 1))
    return (x_shape[0], w_shape[0]) + tuple(spatial)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy a Python tuple into a C int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the memory stays
    alive as long as the returned pointer does.
    """
    c_array = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(c_array, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    w_shape,
    pads,
    strides,
    dilations,
    tensor_stride=None,
    tensor_dtype=torch.float16,
):
    """Check the library's Conv operator against the PyTorch reference.

    When PROFILE is set, also reports averaged wall-clock times for both
    implementations. float16 results are compared with a looser rtol than
    float32.
    """
    assert len(pads) == len(strides) == len(dilations)
    print(
        f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(
        inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
    ).to(torch_device)
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = conv(x, w, strides, pads, dilations)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = conv(x, w, strides, pads, dilations)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    w_tensor = to_tensor(w, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopConvDescriptor_t()
    check_error(
        lib.infiniopCreateConvDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w_tensor.descriptor,
            tuple_to_void_p(pads),
            tuple_to_void_p(strides),
            tuple_to_void_p(dilations),
            len(pads),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    w_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is allocated as a raw uint8 tensor on the target device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopConv(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                w_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopConv(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    w_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # fp16 accumulates more rounding error, so allow a looser tolerance.
    if (tensor_dtype == torch.float16):
        assert torch.allclose(y, ans, atol=0, rtol=1e-2)
    else:
        assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Conv test case on CPU in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Conv test case on CUDA in both fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Conv test case on a Cambricon MLU in both fp16 and fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, w_shape, pads, strides, dilations, x_strides
        (
            (32, 3, 4),
            (32, 3, 5),
            (1,),
            (1,),
            (1,),
            None,
        ),
        (
            (1, 3, 4, 4),
            (2, 3, 3, 3),
            (1, 1),
            (1, 2),
            (2, 1),
            None,
        ),
        (
            (32, 3, 128, 128),
            (64, 3, 5, 5),
            (2, 2),
            (2, 2),
            (1, 1),
            None,
        ),
        (
            (1, 1, 4, 4, 4),
            (1, 1, 5, 5, 5),
            (1, 1, 1),
            (1, 1, 1),
            (1, 1, 1),
            None,
        ),
        (
            (32, 3, 32, 32, 32),
            (64, 3, 5, 5, 5),
            (3, 2, 2),
            (4, 3, 3),
            (2, 2, 1),
            None,
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments correctly.
    lib.infiniopCreateConvDescriptor.restype = c_int32
    lib.infiniopCreateConvDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopConvDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_uint64,
    ]
    lib.infiniopConv.restype = c_int32
    lib.infiniopConv.argtypes = [
        infiniopConvDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyConvDescriptor.restype = c_int32
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopConvDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag is given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False  # when True, time both the PyTorch reference and the lib call
NUM_PRERUN = 10  # warm-up iterations run before timing
NUM_ITERATIONS = 1000  # timed iterations averaged into the reported elapsed time
class ExpandDescriptor(Structure):
    """ctypes mirror of the opaque C-side Expand descriptor (device id only)."""
    _fields_ = [("device", c_int32)]


# Pointer type passed to/returned by the C library.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """PyTorch reference for Expand: broadcast x to y's shape."""
    if not PROFILE:
        return x.expand_as(y)
    # Profiling path: materialize the view and sync so timing is meaningful.
    ans = x.expand_as(y).clone()
    torch.cuda.synchronize()
    return ans
def test(
    lib,
    handle,
    torch_device,
    y_shape,
    x_shape,
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
):
    """Run one Expand case: broadcast x (x_shape) into y (y_shape) with the lib
    kernel and compare against torch's expand_as. Optional strides rearrange
    the inputs into non-contiguous layouts first."""
    print(
        f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = expand(x, y)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = expand(x, y)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopExpandDescriptor_t()
    check_error(
        lib.infiniopCreateExpandDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # The lib kernel writes its result into y in place.
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Expand case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Expand case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Expand case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", *case, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # y_shape, x_shape, y_stride, x_stride
        ((), (), None, None),
        ((3, 3), (1,), None, None),
        ((5, 4, 3), (4, 3,), None, (6, 1)),
        ((99, 111), (111,), None, None),
        ((2, 4, 3), (1, 3), None, None),
        ((2, 20, 3), (2, 1, 3), None, None),
        ((2, 3, 4, 5), (5,), None, None),
        ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
        ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the Expand entry points.
    lib.infiniopCreateExpandDescriptor.restype = c_int32
    lib.infiniopCreateExpandDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopExpandDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopExpand.restype = c_int32
    lib.infiniopExpand.argtypes = [
        infiniopExpandDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyExpandDescriptor.restype = c_int32
    lib.infiniopDestroyExpandDescriptor.argtypes = [
        infiniopExpandDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
    """Opaque mirror of the C-side GEMM descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
    """Reference GEMM: alpha * op(A) @ op(B) + beta * C, cast to dtype.

    op(X) is X.T when the corresponding trans flag is set. For float16 the
    whole computation is performed in float32 and only the final result is
    cast back, mirroring what accelerator GEMM kernels typically accumulate in.
    """
    A = A.T if transA else A
    B = B.T if transB else B
    # FIX: the original `result += beta * C if dtype != torch.float16 else
    # C.to(torch.float32)` parsed as `result += (beta*C if ... else C.to(f32))`,
    # silently dropping beta in the fp16 branch; and the in-place += of a
    # float32 tensor into an already-downcast float16 result raises a dtype
    # promotion error in modern PyTorch. Accumulate in the compute dtype first,
    # then cast once at the end.
    compute_dtype = torch.float32 if dtype == torch.float16 else dtype
    result = alpha * torch.matmul(A.to(compute_dtype), B.to(compute_dtype))
    if C is not None:
        result = result + beta * C.to(compute_dtype)
    result = result.to(dtype)
    if PROFILE:
        torch.cuda.synchronize()
    return result
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    transA,
    transB,
    a_shape,
    b_shape,
    c_shape,
    y_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    y_stride=None,
    dtype=torch.float16,
):
    """Run one GEMM case: y = alpha * op(a) @ op(b) + beta * c with the lib
    kernel and compare against the torch reference. c_shape may be falsy to
    test the no-bias path; optional strides produce non-contiguous inputs."""
    print(
        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
        f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
        f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
    )
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    # A falsy c_shape means "no C operand" (descriptor receives NULL below).
    c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None
    y = torch.rand(y_shape, dtype=dtype).to(torch_device)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None and c is not None:
        c = rearrange_tensor(c, c_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = gemm(a, b, c, transA, transB, alpha, beta, dtype)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = gemm(a, b, c, transA, transB, alpha, beta, dtype)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib) if c is not None else None
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopGEMMDescriptor_t()
    check_error(
        lib.infiniopCreateGEMMDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor,
            c_tensor.descriptor if c_tensor else None,
            alpha,
            beta,
            transA,
            transB,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    if c_tensor is not None:
        c_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspace_size = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGEMMWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    # Workspace is allocated as a torch byte tensor so it lives on torch_device.
    workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGEMM(
                descriptor,
                workspace_ptr,
                workspace_size,
                y_tensor.data,
                a_tensor.data,
                b_tensor.data,
                c_tensor.data if c_tensor else None,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGEMM(
                    descriptor,
                    workspace_ptr,
                    workspace_size,
                    y_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    c_tensor.data if c_tensor else None,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-2)
    check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every GEMM case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", *case, dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every GEMM case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", *case, dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GEMM case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", *case, dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
        (
            1.0,
            1.0,
            False,
            False,
            (1, 2048),
            (2048, 2048),
            (1, 2048),
            (1, 2048),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            True,
            True,
            (2048, 4),
            (2048, 2048),
            (4, 2048),
            (4, 2048),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            False,
            True,
            (1, 2048),
            (1000, 2048),
            # NOTE(review): (1000) is an int, not a 1-tuple, so this exercises a
            # broadcast bias; torch.rand(1000) still yields shape (1000,).
            (1000),
            (1, 1000),
            None,
            None,
            None,
            None,
        ),
        (
            1.0,
            1.0,
            True,
            False,
            (2048, 4),
            (2048, 2048),
            (2048),
            (4, 2048),
            (4096, 1),
            (4096, 1),
            (2,),
            (4096, 1),
        ),
        (
            1.0,
            1.0,
            False,
            False,
            (3, 1, 2048),
            (3, 2048, 2048),
            (1,),
            (3, 1, 2048),
            None,
            None,
            None,
            None,
        ),
        # c_shape None exercises the bias-free path.
        (
            1.0,
            1.0,
            True,
            False,
            (2048, 4),
            (2048, 2048),
            None,
            (4, 2048),
            (4096, 1),
            (4096, 1),
            (2,),
            (4096, 1),
        ),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the GEMM entry points.
    lib.infiniopCreateGEMMDescriptor.restype = c_int32
    lib.infiniopCreateGEMMDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGEMMDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_float,
        c_float,
        c_bool,
        c_bool,
    ]
    lib.infiniopGetGEMMWorkspaceSize.restype = c_int32
    lib.infiniopGetGEMMWorkspaceSize.argtypes = [
        infiniopGEMMDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGEMM.restype = c_int32
    lib.infiniopGEMM.argtypes = [
        infiniopGEMMDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGEMMDescriptor.restype = c_int32
    lib.infiniopDestroyGEMMDescriptor.argtypes = [
        infiniopGEMMDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GlobalAvgPoolDescriptor(Structure):
    """Opaque mirror of the C-side GlobalAvgPool descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape of a global average pool: keep (N, C), collapse every spatial dim to 1."""
    spatial_ndim = x.dim() - 2
    return x.shape[:2] + (1,) * spatial_ndim
def globalAvgPool(x):
    """Reference global average pooling: mean over all spatial dims, shape (N, C, 1, ..., 1)."""
    reduce_dims = tuple(range(2, x.dim()))
    y = torch.mean(x, dim=reduce_dims, keepdim=True)
    if PROFILE:
        # Drain the CUDA queue so profiling timings are meaningful.
        torch.cuda.synchronize()
    return y.view(*inferShape(x))
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
):
    """Run one GlobalAvgPool case against the torch.mean reference for x_shape."""
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
    # Reference answer (extra iterations only warm up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = globalAvgPool(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = globalAvgPool(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopGlobalAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGlobalAvgPoolWorkspaceSize(
            descriptor, ctypes.byref(workspaceSize)
        )
    )
    # Workspace is allocated as a torch byte tensor so it lives on torch_device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGlobalAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every GlobalAvgPool case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every GlobalAvgPool case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every GlobalAvgPool case on the Cambricon MLU backend, in fp16 then fp32."""
    import torch_mlu

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape
        # NOTE(review): the extra parentheses are redundant — each entry is
        # already a tuple, not a 1-tuple of tuples.
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the GlobalAvgPool entry points.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
from .devices import *
from .utils import *
from .datatypes import *
class InfiniDtype:
    """Integer dtype tags; presumably mirror the C library's dtype enum, so the
    numeric values must never be renumbered — TODO confirm against the C header."""

    INVALID = 0
    BYTE = 1
    BOOL = 2
    # signed integers
    I8 = 3
    I16 = 4
    I32 = 5
    I64 = 6
    # unsigned integers
    U8 = 7
    U16 = 8
    U32 = 9
    U64 = 10
    # floating point
    F8 = 11
    F16 = 12
    F32 = 13
    F64 = 14
    # complex
    C8 = 15
    C16 = 16
    C32 = 17
    C64 = 18
    BF16 = 19
class InfiniDeviceEnum:
    """Integer device identifiers passed to infiniopCreateHandle.

    Values mirror the C-side device enum and must stay in sync with it.
    """

    CPU = 0
    NVIDIA = 1
    CAMBRICON = 2
    ASCEND = 3
    # FIX: the entries below previously ended with trailing commas, which made
    # each of them a 1-tuple (e.g. METAX == (4,)), not an int — any ctypes call
    # expecting a c_int device tag would raise ArgumentError for these devices.
    METAX = 4
    MOORE = 5
    ILUVATAR = 6
    KUNLUN = 7
    SUGON = 8
from calendar import c
import os
import platform
import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER, c_size_t
from .datatypes import *
from .devices import *
Device = c_int
Optype = c_int
INFINI_ROOT = os.environ.get("INFINI_ROOT")
class TensorDescriptor(Structure):
    """ctypes mirror of the C tensor descriptor (dtype tag, rank, shape, strides).

    Field layout must match the C struct exactly; strides are expressed in
    elements (to_tensor fills them from torch's Tensor.stride()).
    """

    _fields_ = [
        ("dtype", c_int),                 # InfiniDtype tag
        ("ndim", c_size_t),               # tensor rank
        ("shape", POINTER(c_size_t)),     # length-ndim array
        ("strides", POINTER(c_int64)),    # length-ndim array, in elements
    ]

    def invalidate(self):
        """Zero the shape/strides arrays so kernels cannot read them directly
        (tests use this to prove the kernel relies only on descriptor-creation-time data)."""
        for i in range(self.ndim):
            self.shape[i] = 0
            self.strides[i] = 0


# Pointer type passed across the C API boundary.
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs a library tensor descriptor with the raw data pointer it describes.

    The backing torch tensor must be kept alive by the caller for as long as
    `data` is used — presumably the C side never copies it; TODO confirm.
    """

    def __init__(self, desc, data):
        self.descriptor = desc  # infiniopTensorDescriptor_t
        self.data = data        # raw pointer (int) from Tensor.data_ptr()
class Handle(Structure):
    """ctypes mirror of the C infiniop handle: device kind plus device index."""

    _fields_ = [("device", c_int), ("device_id", c_int)]


# Pointer type passed across the C API boundary.
infiniopHandle_t = POINTER(Handle)
# Open operators library
def open_lib():
    """Locate and load the infiniop shared library under INFINI_ROOT.

    Returns the ctypes.CDLL handle with argtypes/restype declared for the
    handle and tensor-descriptor entry points. Raises AssertionError when the
    library cannot be found (INFINI_ROOT unset/wrong, or unsupported OS).
    """

    def find_library_in_ld_path(subdir, library_name):
        # INFINI_ROOT/<subdir> may itself be an os.pathsep-separated list.
        ld_library_path = os.path.join(INFINI_ROOT, subdir)
        paths = ld_library_path.split(os.pathsep)
        for path in paths:
            full_path = os.path.join(path, library_name)
            if os.path.isfile(full_path):
                return full_path
        return None

    system_name = platform.system()
    # FIX: library_path was previously unbound on systems other than
    # Windows/Linux, turning the friendly assert below into a NameError.
    library_path = None
    # Load the library
    if system_name == "Windows":
        library_path = find_library_in_ld_path("bin", "infiniop.dll")
        # Best-effort preload of cuDNN so infiniop.dll's dependency resolves.
        # FIX: this hard-coded CDLL call used to run unconditionally, which
        # raised OSError on every non-Windows host (and on Windows machines
        # without CUDA 12.2 at this exact path), making open_lib unusable.
        try:
            ctypes.CDLL(r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\cudnn64_9.dll")
        except OSError:
            pass
    elif system_name == "Linux":
        library_path = find_library_in_ld_path("lib", "libinfiniop.so")
    assert (
        library_path is not None
    ), "Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly."
    lib = ctypes.CDLL(library_path)
    # Declare ctypes signatures for the core entry points.
    lib.infiniopCreateTensorDescriptor.argtypes = [
        POINTER(infiniopTensorDescriptor_t),
        c_uint64,
        POINTER(c_uint64),
        POINTER(c_int64),
        c_int,
    ]
    lib.infiniopCreateHandle.argtypes = [POINTER(infiniopHandle_t), c_int, c_int]
    lib.infiniopCreateHandle.restype = c_int
    lib.infiniopDestroyHandle.argtypes = [infiniopHandle_t]
    lib.infiniopDestroyHandle.restype = c_int
    return lib
import ctypes
from .datatypes import *
from .liboperators import infiniopTensorDescriptor_t, CTensor, infiniopHandle_t
def check_error(status):
    """Raise when a library call returns a nonzero status code; no-op on 0."""
    if status != 0:
        raise Exception(f"Error code {status}")
def to_tensor(tensor, lib):
    """
    Convert a PyTorch tensor to a library Tensor(descriptor, data).

    Copies ndim/shape/strides (strides in elements, from Tensor.stride()) into
    a freshly created library descriptor and pairs it with the tensor's raw
    data pointer. The torch tensor is not moved or copied, so the caller must
    keep it alive while the returned CTensor is in use.
    """
    import torch
    ndim = tensor.ndimension()
    shape = (ctypes.c_size_t * ndim)(*tensor.shape)
    strides = (ctypes.c_int64 * ndim)(*(tensor.stride()))
    data_ptr = tensor.data_ptr()
    # Map the torch dtype to the matching InfiniDtype tag. The chain is kept
    # lazy on purpose: the uint16/32/64 attributes are only touched when no
    # earlier branch matched, so older PyTorch builds without them still work.
    # fmt: off
    dt = (
        InfiniDtype.I8 if tensor.dtype == torch.int8 else
        InfiniDtype.I16 if tensor.dtype == torch.int16 else
        InfiniDtype.I32 if tensor.dtype == torch.int32 else
        InfiniDtype.I64 if tensor.dtype == torch.int64 else
        InfiniDtype.U8 if tensor.dtype == torch.uint8 else
        InfiniDtype.F16 if tensor.dtype == torch.float16 else
        InfiniDtype.BF16 if tensor.dtype == torch.bfloat16 else
        InfiniDtype.F32 if tensor.dtype == torch.float32 else
        InfiniDtype.F64 if tensor.dtype == torch.float64 else
        # TODO: These following types may not be supported by older
        # versions of PyTorch.
        InfiniDtype.U16 if tensor.dtype == torch.uint16 else
        InfiniDtype.U32 if tensor.dtype == torch.uint32 else
        InfiniDtype.U64 if tensor.dtype == torch.uint64 else
        None
    )
    # fmt: on
    assert dt is not None
    # Create TensorDescriptor
    tensor_desc = infiniopTensorDescriptor_t()
    lib.infiniopCreateTensorDescriptor(
        ctypes.byref(tensor_desc), ndim, shape, strides, dt
    )
    # Create Tensor
    return CTensor(tensor_desc, data_ptr)
def create_workspace(size, torch_device):
    """Allocate a zeroed uint8 workspace tensor on `torch_device`; None when size == 0."""
    if not size:
        return None
    import torch

    return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
def create_handle(lib, device, id=0):
    """Create and return an infiniop handle for `device` (index `id`); raises on failure."""
    handle = infiniopHandle_t()
    status = lib.infiniopCreateHandle(ctypes.byref(handle), device, id)
    check_error(status)
    return handle
def destroy_handle(lib, handle):
    """Release a handle created by create_handle; raises on a nonzero status."""
    status = lib.infiniopDestroyHandle(handle)
    check_error(status)
def rearrange_tensor(tensor, new_strides):
    """
    Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides.

    The result owns fresh zero-filled storage sized for the strided layout and
    holds the same element values as `tensor`.

    Raises ValueError for non-positive strides (not supported yet).
    """
    import torch

    shape = tensor.shape
    # Highest linear index reachable under the new strides. Only positive
    # strides are supported, so the lowest index is always 0. (FIX: removed the
    # unused `new_size` list and the always-zero `left`/`offset` bookkeeping.)
    right = 0
    for i in range(len(shape)):
        if new_strides[i] > 0:
            right += new_strides[i] * (shape[i] - 1)
        else:  # TODO: Support negative strides in the future
            raise ValueError("Negative strides are not supported yet")
    # Flat zero-filled storage covering the whole layout (gaps stay zero).
    new_tensor = torch.zeros(
        (right + 1,), dtype=tensor.dtype, device=tensor.device
    )
    # Cartesian index grid over the original shape.
    indices = [torch.arange(s) for s in shape]
    mesh = torch.meshgrid(*indices, indexing="ij")
    linear_indices = [m.flatten() for m in mesh]
    # Linear destination of every element under the new strides.
    new_positions = sum(
        linear_indices[i] * new_strides[i] for i in range(len(shape))
    ).to(tensor.device)
    # Scatter the original data into the strided storage.
    new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
    # Reinterpret the flat storage with the requested shape/strides.
    new_tensor.set_(new_tensor.untyped_storage(), 0, shape, tuple(new_strides))
    return new_tensor
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
import time
sys.path.append("..")
from libinfiniop import (
open_lib,
to_tensor,
CTensor,
InfiniDeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from test_utils import get_args, synchronize_device
import torch
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MatmulDescriptor(Structure):
    """Opaque mirror of the C-side Matmul descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
def matmul(_c, beta, _a, _b, alpha):
    """Reference result alpha * (a @ b) + beta * c.

    Inputs are cloned so the caller's tensors are untouched; the product is
    accumulated in float32 and cast back to c's dtype before scaling.
    """
    a, b, c = _a.clone(), _b.clone(), _c.clone()
    out_dtype = c.dtype
    product = torch.matmul(a.to(torch.float32), b.to(torch.float32)).to(out_dtype)
    return alpha * product + beta * c
def test(
    lib,
    handle,
    torch_device,
    alpha,
    beta,
    a_shape,
    b_shape,
    c_shape,
    a_stride=None,
    b_stride=None,
    c_stride=None,
    dtype=torch.float16,
):
    """Run one Matmul case: c = alpha * a @ b + beta * c with the lib kernel
    and compare against the torch reference; optionally profile both.

    Optional strides rearrange the inputs into non-contiguous layouts before
    the library sees them. The lib kernel writes into c in place.
    """
    print(
        f"Testing Matmul on {torch_device} with a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape}"
        f" a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} dtype:{dtype}"
    )
    a = torch.rand(a_shape, dtype=dtype).to(torch_device)
    b = torch.rand(b_shape, dtype=dtype).to(torch_device)
    c = torch.ones(c_shape, dtype=dtype).to(torch_device)
    # Reference answer; safe to compute before rearrange since values are preserved.
    ans = matmul(c, beta, a, b, alpha)
    if a_stride is not None:
        a = rearrange_tensor(a, a_stride)
    if b_stride is not None:
        b = rearrange_tensor(b, b_stride)
    if c_stride is not None:
        c = rearrange_tensor(c, c_stride)
    a_tensor = to_tensor(a, lib)
    b_tensor = to_tensor(b, lib)
    c_tensor = to_tensor(c, lib)
    descriptor = infiniopMatmulDescriptor_t()
    check_error(
        lib.infiniopCreateMatmulDescriptor(
            handle,
            ctypes.byref(descriptor),
            c_tensor.descriptor,
            a_tensor.descriptor,
            b_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    a_tensor.descriptor.contents.invalidate()
    b_tensor.descriptor.contents.invalidate()
    c_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, a.device)
    check_error(
        lib.infiniopMatmul(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            c_tensor.data,
            a_tensor.data,
            b_tensor.data,
            alpha,
            beta,
            None,
        )
    )
    assert torch.allclose(c, ans, atol=0, rtol=1e-2)
    if PROFILE:
        for i in range(NUM_PRERUN):
            _ = matmul(c, beta, a, b, alpha)
        synchronize_device(torch_device)
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = matmul(c, beta, a, b, alpha)
        synchronize_device(torch_device)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" pytorch time: {elapsed * 1000 :6f} ms")
        # FIX: the two profiling loops below previously omitted the alpha/beta
        # arguments, so ctypes raised ArgumentError (argtypes declares nine
        # parameters, as used by the correctness call above) whenever PROFILE
        # was enabled.
        for i in range(NUM_PRERUN):
            check_error(
                lib.infiniopMatmul(
                    descriptor,
                    workspace.data_ptr() if workspace is not None else None,
                    workspace_size.value,
                    c_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    alpha,
                    beta,
                    None,
                )
            )
        synchronize_device(torch_device)
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMatmul(
                    descriptor,
                    workspace.data_ptr() if workspace is not None else None,
                    workspace_size.value,
                    c_tensor.data,
                    a_tensor.data,
                    b_tensor.data,
                    alpha,
                    beta,
                    None,
                )
            )
        synchronize_device(torch_device)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed * 1000 :6f} ms")
    check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Matmul case on the CPU backend (each case carries its own dtype)."""
    handle = create_handle(lib, InfiniDeviceEnum.CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Matmul case on the NVIDIA CUDA backend (each case carries its own dtype)."""
    handle = create_handle(lib, InfiniDeviceEnum.NVIDIA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Matmul case on the Cambricon MLU backend (each case carries its own dtype)."""
    import torch_mlu

    handle = create_handle(lib, InfiniDeviceEnum.CAMBRICON)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every Matmul case on the Ascend NPU backend (each case carries its own dtype)."""
    import torch_npu

    handle = create_handle(lib, InfiniDeviceEnum.ASCEND)
    for case in test_cases:
        test(lib, handle, "npu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # alpha, beta, a_shape, b_shape, c_shape, a_stride, b_stride, c_stride, dtype
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float16),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), None, None, None, torch.float32),
        (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float16),
        (1.0, 0.0, (2, 4, 2048), (2, 2048, 2048), (2, 4, 2048), None, None, None, torch.float32),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float16),
        (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1), torch.float32),
        # transposed-B layout expressed through strides
        (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float16),
        (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1), torch.float32),
        # attention-style scaled batched matmul
        (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float16),
        (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None, torch.float32),
    ]
    args = get_args()
    lib = open_lib()
    # Declare ctypes signatures for the Matmul entry points.
    lib.infiniopCreateMatmulDescriptor.restype = c_int32
    lib.infiniopCreateMatmulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMatmulDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t
    ]
    lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
    lib.infiniopGetMatmulWorkspaceSize.argtypes = [
        infiniopMatmulDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopMatmul.restype = c_int32
    lib.infiniopMatmul.argtypes = [
        infiniopMatmulDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
        c_float,
        c_float,
        c_void_p,
    ]
    lib.infiniopDestroyMatmulDescriptor.restype = c_int32
    lib.infiniopDestroyMatmulDescriptor.argtypes = [
        infiniopMatmulDescriptor_t,
    ]
    if args.profile:
        PROFILE = True
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to CPU when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MaxPoolDescriptor(Structure):
    """Opaque mirror of the C-side MaxPool descriptor; only the device tag is visible."""

    _fields_ = [("device", c_int32)]


# Pointer type passed across the C API boundary.
infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
    """Reference max pooling via torch.nn.MaxPool{1,2,3}d.

    Dispatches on the number of spatial dims (rank - 2); prints an error and
    returns None for unsupported ranks.
    """
    layer_by_ndim = {
        1: torch.nn.MaxPool1d,
        2: torch.nn.MaxPool2d,
        3: torch.nn.MaxPool3d,
    }
    layer_cls = layer_by_ndim.get(x.dim() - 2)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    out = layer_cls(k, stride=stride, padding=padding, dilation=dilation)(x)
    if PROFILE:
        torch.cuda.synchronize()
    return out
def inferShape(x_shape, kernel_shape, padding, strides):
    """Compute the max-pool output shape: (N, C) followed by
    (dim + 2*pad - kernel) // stride + 1 for every spatial dim."""
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = [
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    ]
    return x_shape[:2] + tuple(pooled)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Pack a tuple of ints into a C int64 array and return it as a void pointer.

    ctypes.cast keeps a reference to the backing array on the returned object,
    so the memory stays valid for the pointer's lifetime.
    """
    packed = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(packed, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
):
    """Run one MaxPool case: library kernel vs. the PyTorch reference.

    Builds x and an output buffer y of the inferred shape, computes the
    reference with `pool`, then creates the lib descriptor, queries and
    allocates the workspace, launches the kernel, and checks y against
    the reference with allclose. Optionally profiles both paths.
    """
    print(
        f"Testing MaxPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
    # Warm-up runs also produce the reference answer `ans`.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopMaxPoolDescriptor_t()
    # Kernel/padding/stride tuples are marshalled as raw int64 arrays.
    check_error(
        lib.infiniopCreateMaxPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Workspace is a raw byte buffer owned by torch so it lives on the right device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopMaxPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopMaxPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyMaxPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MaxPool case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MaxPool case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MaxPool case on the Cambricon MLU backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dtype)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # x_shape, kernel_shape, padding, strides
        ((1, 1, 10), (3,), (1,), (1,)),
        ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
        ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateMaxPoolDescriptor.restype = c_int32
    lib.infiniopCreateMaxPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMaxPoolDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
        c_void_p,  # kernel shape (int64 array)
        c_void_p,  # padding (int64 array)
        c_void_p,  # strides (int64 array)
        c_uint64,  # number of spatial dims
    ]
    lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [
        infiniopMaxPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMaxPool.restype = c_int32
    lib.infiniopMaxPool.argtypes = [
        infiniopMaxPoolDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32
    lib.infiniopDestroyMaxPoolDescriptor.argtypes = [
        infiniopMaxPoolDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn as nn
class MLPDescriptor(Structure):
    # Opaque mirror of the native MLP descriptor; only the leading `device`
    # field is declared here, the full layout lives in the C library.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop MLP C API.
infiniopMLPDescriptor_t = POINTER(MLPDescriptor)
def swiglu(a, b):
    """SwiGLU combine: a * b * sigmoid(b).

    The exponential is evaluated in fp32 for accuracy, then cast back to
    b's dtype before forming the denominator.
    """
    gate_denom = 1 + torch.exp(-b.float()).to(b.dtype)
    return a * b / gate_denom
def mlp(y, x, w12, w3, alpha, residual):
    """Reference SwiGLU MLP: up-projections in fp32, gated combine, scaled
    down-projection, and optional residual add of y.

    w12 packs both up-projection weights: columns [0, inter) feed the gate
    input b, columns [inter, 2*inter) feed a, where inter = w3.shape[0].
    """
    out_dtype = x.dtype
    inter = w3.shape[0]
    x32 = x.to(torch.float32)
    a = torch.matmul(x32, w12[:, inter:].to(torch.float32)).to(out_dtype)
    b = torch.matmul(x32, w12[:, 0:inter].to(torch.float32)).to(out_dtype)
    # SwiGLU gate (inlined from swiglu): a * b * sigmoid(b), exp in fp32.
    gated = a * b / (1 + torch.exp(-b.float()).to(b.dtype))
    down = torch.matmul(gated.to(torch.float32), alpha * w3.to(torch.float32)).to(out_dtype)
    return down + y if residual else down
def test(
    lib,
    handle,
    torch_device,
    num_tokens,
    hidden_size,
    intermediate_size,
    alpha,
    residual,
    dtype=torch.float16,
    x_stride=None,
    y_stride=None,
    w12_stride=None,
    w3_stride=None,
):
    """Run one MLP case: library kernel vs. the PyTorch reference `mlp`.

    Inputs are scaled by 0.01 to keep fp16 matmuls well within range. The
    optional *_stride arguments rearrange each tensor to a non-contiguous
    layout AFTER the reference is computed, so the kernel is exercised on
    strided memory while the expected values stay the same.
    """
    print(
        f"Testing MLP on {torch_device} with num_tokens:{num_tokens} hidden_size:{hidden_size} intermediate_size:{intermediate_size}"
        f" alpha:{alpha} residual:{residual} dtype:{dtype} x_stride:{x_stride} y_stride:{y_stride} w12_stride:{w12_stride} w3_stride:{w3_stride}"
    )
    y = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    x = torch.rand([num_tokens, hidden_size], dtype=dtype).to(torch_device) * 0.01
    w12 = (
        torch.rand([hidden_size, 2 * intermediate_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    w3 = (
        torch.rand([intermediate_size, hidden_size], dtype=dtype).to(torch_device)
        * 0.01
    )
    ans = mlp(y, x, w12, w3, alpha, residual)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    if w12_stride is not None:
        w12 = rearrange_tensor(w12, w12_stride)
    if w3_stride is not None:
        w3 = rearrange_tensor(w3, w3_stride)
    y_tensor = to_tensor(y, lib)
    x_tensor = to_tensor(x, lib)
    w12_tensor = to_tensor(w12, lib)
    w3_tensor = to_tensor(w3, lib)
    descriptor = infiniopMLPDescriptor_t()
    check_error(
        lib.infiniopCreateMLPDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            w12_tensor.descriptor,
            w3_tensor.descriptor,
            alpha,
            residual,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    y_tensor.descriptor.contents.invalidate()
    x_tensor.descriptor.contents.invalidate()
    w12_tensor.descriptor.contents.invalidate()
    w3_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetMLPWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = create_workspace(workspace_size.value, x.device)
    # The kernel writes its result into y in place.
    check_error(
        lib.infiniopMLP(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            y_tensor.data,
            x_tensor.data,
            w12_tensor.data,
            w3_tensor.data,
            None,
        )
    )
    # Looser tolerance (2e-2): fp16 matmul accumulation differs between paths.
    assert torch.allclose(y, ans, atol=0, rtol=2e-2)
    check_error(lib.infiniopDestroyMLPDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every MLP case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every MLP case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every MLP case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    # Each case tuple matches test()'s positional parameters after the device.
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # num_tokens, hidden_size, intermediate_size, alpha, residual, dtype, x_stride, y_stride, w12_stride, w3_stride
        (4, 4096, 11008, 1.0, True, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, True, torch.float16, [8192, 1], [8192, 1], None, None),
        (
            4,
            4096,
            11008,
            1.0,
            True,
            torch.float16,
            None,
            None,
            [1, 4096],  # column-major w12
            [1, 11008],  # column-major w3
        ),
        (4, 4096, 11008, 1.0, False, torch.float16, None, None, None, None),
        (4, 4096, 11008, 1.0, False, torch.float16, [8192, 1], [8192, 1], None, None),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateMLPDescriptor.restype = c_int32
    lib.infiniopCreateMLPDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopMLPDescriptor_t),
        infiniopTensorDescriptor_t,  # y
        infiniopTensorDescriptor_t,  # x
        infiniopTensorDescriptor_t,  # w12
        infiniopTensorDescriptor_t,  # w3
        c_float,  # alpha
        c_bool,   # residual
    ]
    lib.infiniopGetMLPWorkspaceSize.restype = c_int32
    lib.infiniopGetMLPWorkspaceSize.argtypes = [
        infiniopMLPDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopMLP.restype = c_int32
    lib.infiniopMLP.argtypes = [
        infiniopMLPDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # w12 data
        c_void_p,  # w3 data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyMLPDescriptor.restype = c_int32
    lib.infiniopDestroyMLPDescriptor.argtypes = [
        infiniopMLPDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
U64,
)
from operatorspy.tests.test_utils import get_args
import torch
class RandomSampleDescriptor(Structure):
    # Opaque mirror of the native RandomSample descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop RandomSample C API.
infiniopRandomSampleDescriptor_t = POINTER(RandomSampleDescriptor)
def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
    """Reference temperature + top-k/top-p sampling over 1-D logits.

    Args:
        data: 1-D logits tensor of length `voc` (called with CPU data).
        random_val: uniform sample in [0, 1) used to pick the token.
        topp: nucleus (cumulative-probability) threshold.
        topk: number of top candidates to keep.
        voc: vocabulary size, == len(data).
        temperature: softmax temperature.
        torch_device: unused here; kept for signature symmetry.

    Returns:
        The original index (tensor scalar) of the sampled token.
    """
    indices = torch.zeros([topk], dtype = torch.int64)
    dataNp = data.clone().detach()
    sorted_indices = torch.arange(voc)
    # Partial selection sort (descending) of both the values and a parallel
    # index array: after i outer iterations, positions [0, i) hold the i
    # largest values seen so far.
    for i in range(topk):
        for j in range(i + 1, voc):
            if(dataNp[i] < dataNp[j]):
                tmp = dataNp[i].clone().detach()
                dataNp[i] = dataNp[j].clone().detach()
                dataNp[j] = tmp
                tmpInd = sorted_indices[i].clone().detach()
                sorted_indices[i] = sorted_indices[j].clone().detach()
                sorted_indices[j] = tmpInd
    #sorted_indices = torch.argsort(dataNp, descending=True)
    indices = sorted_indices[:topk]
    # NOTE(review): dataNp was already permuted in place by the sort above;
    # gathering it again with sorted_indices applies a second permutation,
    # and globalM below is then not necessarily the global max — confirm
    # this matches the C kernel's intent.
    dataNp = dataNp[sorted_indices]
    globalM = dataNp[0]
    # Temperature-scaled softmax, shifted by the leading entry for stability.
    dataNp = (dataNp - globalM) / temperature
    dataNp = torch.softmax(dataNp.float(), dim = 0)
    sum_s = 0
    # Nucleus cutoff: smallest prefix of the top-k whose mass reaches topp.
    for end in range(topk):
        sum_s += dataNp[end]
        if(sum_s >= topp):
            break
    # `end` leaks from the loop: bump it to an exclusive bound (or topk).
    if(end < topk - 1):
        end += 1
    else:
        end = topk
    # Rescale random_val by the truncated mass, then walk the prefix again
    # until the cumulative probability exceeds it.
    sum_s = 0
    for i in range(end):
        sum_s += dataNp[i]
    random_val *= sum_s
    sum_s = 0
    for i in range(end):
        sum_s += dataNp[i]
        if(random_val < sum_s):
            return indices[i]
    # NOTE(review): falls through (returns None) only if random_val never
    # drops below the running sum; float rounding could make this reachable.
def random_sample_0(data):
    """Greedy fallback: return the index of the largest logit."""
    return data.argmax()
def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
    """Run one RandomSample case: library kernel vs. the Python reference.

    Builds distinct, shuffled logits so the expected index is unambiguous
    (up to exact ties in the sampled logit value), then compares the
    kernel's sampled index against `random_sample` / `random_sample_0`.
    """
    print(
        f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
    )
    # Distinct values 0, 1e-4, 2e-4, ... shuffled over the vocabulary.
    data = torch.arange(voc).float() * 0.0001
    _perm = torch.randperm(voc)
    data = data[_perm].to(x_dtype).to(torch_device)
    if(topp > 0 and topk > 1):
        ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
    else:
        # Degenerate configuration: plain argmax.
        ans = random_sample_0(data)
    indices = torch.zeros([1], dtype=torch.int64).to(torch_device)
    x_tensor = to_tensor(data, lib)
    indices_tensor = to_tensor(indices, lib)
    indices_tensor.descriptor.contents.dt = U64 # treat int64 as uint64
    descriptor = infiniopRandomSampleDescriptor_t()
    check_error(
        lib.infiniopCreateRandomSampleDescriptor(
            handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    indices_tensor.descriptor.contents.invalidate()
    workspace_size = c_uint64(0)
    check_error(
        lib.infiniopGetRandomSampleWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = create_workspace(workspace_size.value, torch_device)
    check_error(
        lib.infiniopRandomSample(
            descriptor,
            workspace.data_ptr() if workspace is not None else None,
            workspace_size.value,
            indices_tensor.data,
            x_tensor.data,
            random_val,
            topp,
            topk,
            temperature,
            None,
        )
    )
    if torch_device == "npu":
        torch.npu.synchronize()
    # Accept either the exact same index or a different index with an equal
    # logit value (a tie is an equally valid sample).
    assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
    check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every RandomSample case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for case in test_cases:
        test(lib, handle, "cpu", *case)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every RandomSample case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for case in test_cases:
        test(lib, handle, "cuda", *case)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every RandomSample case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for case in test_cases:
        test(lib, handle, "mlu", *case)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every RandomSample case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for case in test_cases:
        test(lib, handle, "npu", *case)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # voc, random_val, topp, topk, temperature
        (512, 0.8, 0.8, 3, 0.5),
        (4096, 0.05, 0.9, 5, 1.0),
        (16384, 0.15, 0.85, 10, 2.0),
        (512, 0.08, 0, 3, 0.5),
        (4096, 0.5, 0.9, 1, 1.0),
        (16384, 0.15, 0, 1, 2.0),
        (16384, 0.15, 0, 1, 2.0),
        (32000, 0.08, 0.8, 50, 1.0),
        (32000, 0.08, 1.0, 25, 1.0),
        # (119696, 0.01, 1.0, 100, 1.0),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
    # FIX: test() passes FOUR arguments (handle, out-descriptor pointer,
    # result-tensor descriptor, probs-tensor descriptor), but argtypes
    # previously declared only one tensor descriptor, so the fourth
    # argument was marshalled unchecked. Declare both descriptors.
    lib.infiniopCreateRandomSampleDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRandomSampleDescriptor_t),
        infiniopTensorDescriptor_t,  # result (sampled index)
        infiniopTensorDescriptor_t,  # probs (logits)
    ]
    lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
    lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
        infiniopRandomSampleDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopRandomSample.restype = c_int32
    lib.infiniopRandomSample.argtypes = [
        infiniopRandomSampleDescriptor_t,
        c_void_p,  # workspace
        c_uint64,  # workspace size
        c_uint64,  # NOTE(review): result pointer declared as u64 — the call
                   # site passes indices_tensor.data here; confirm intended.
        c_void_p,  # probs data
        c_float,   # random_val
        c_float,   # topp
        c_int32,   # topk
        c_float,   # temperature
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
    lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
        infiniopRandomSampleDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import ctypes
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# NOTE(review): class name drops an "a" ("Rerrange" vs "Rearrange"); kept
# as-is since the public alias below is spelled correctly.
class RerrangeDescriptor(Structure):
    # Opaque mirror of the native Rearrange descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop Rearrange C API.
infiniopRearrangeDescriptor_t = POINTER(RerrangeDescriptor)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    x_stride,
    y_shape,
    y_stride,
    x_dtype=torch.float16,
):
    """Run one Rearrange case: copy strided x into strided y via the lib.

    A None stride means the tensor keeps torch's default contiguous layout;
    otherwise `rearrange_tensor` relays the data out with the given strides.
    Correctness check: after the kernel runs, y must equal x elementwise.
    """
    print(
        f"Testing Rerrange on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} y_shape:{y_shape} y_stride:{y_stride} x_dtype:{x_dtype}"
    )
    x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
    y = torch.zeros(y_shape, dtype=x_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    descriptor = infiniopRearrangeDescriptor_t()
    check_error(
        lib.infiniopCreateRearrangeDescriptor(
            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    check_error(
        lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
    )
    assert torch.allclose(x, y, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Rearrange case on the CPU backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "cpu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Rearrange case on the CUDA backend."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Rearrange case on the Cambricon MLU backend."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
    """Run every Rearrange case on the Ascend NPU backend."""
    import torch_npu  # registers the "npu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_ASCEND)
    for (x_shape, x_stride), (y_shape, y_stride) in test_cases:
        test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    args = get_args()
    test_cases = [
        # ((src_shape, src_stride), (dst_shape, dst_stride))
        (((2, 4, 32), None), ((2, 4, 32), (256, 64, 1))),
        (((32, 6, 64), (64, 2560, 1)), ((32, 6, 64), None)),
        (((4, 6, 64), (64, 2560, 1)), ((4, 6, 64), (131072, 64, 1))),
        (((1, 32, 64), (2048, 64, 1)), ((1, 32, 64), (2048, 64, 1))),
        (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
        (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
        (((64,), (1,)), ((64,), (1,))),
    ]
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateRearrangeDescriptor.restype = c_int32
    lib.infiniopCreateRearrangeDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopRearrangeDescriptor_t),
        infiniopTensorDescriptor_t,  # y (destination)
        infiniopTensorDescriptor_t,  # x (source)
    ]
    lib.infiniopRearrange.restype = c_int32
    lib.infiniopRearrange.argtypes = [
        infiniopRearrangeDescriptor_t,
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
    lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopRearrangeDescriptor_t]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if args.ascend:
        test_ascend(lib, test_cases)
    # FIX: every sibling operator test script falls back to the CPU backend
    # when no device flag is supplied; this script was the only one that
    # silently ran nothing (and still printed "Test passed!").
    if not (args.cpu or args.cuda or args.bang or args.ascend):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
# constant for control whether profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class Inplace(Enum):
    # Whether the op writes its result into a fresh tensor or reuses x.
    OUT_OF_PLACE = auto()
    INPLACE_X = auto()
class ReluDescriptor(Structure):
    # Opaque mirror of the native Relu descriptor; only the leading
    # `device` field is declared on the Python side.
    _fields_ = [("device", c_int32)]
# Handle type passed to/returned from the infiniop Relu C API.
infiniopReluDescriptor_t = POINTER(ReluDescriptor)
def relu(x):
    """PyTorch reference ReLU, cast back to the input dtype."""
    result = torch.nn.functional.relu(x).to(x.dtype)
    if PROFILE:
        # Profiling path waits for the (CUDA) kernel so timings are real.
        torch.cuda.synchronize()
    return result
def test(
    lib,
    handle,
    torch_device,
    tensor_shape,
    tensor_dtype=torch.float16,
    inplace=Inplace.OUT_OF_PLACE,
):
    """Run one Relu case: library kernel vs. torch.nn.functional.relu.

    Input is shifted to [-1, 1) so both signs of the activation are hit.
    For INPLACE_X the output tensor and CTensor alias x, so the kernel
    overwrites its own input. Optionally profiles both paths.
    """
    print(
        f"Testing Relu on {torch_device} with tensor_shape:{tensor_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
    )
    x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
    # Warm-up runs also produce the reference answer `ans`.
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = relu(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = relu(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib) if inplace == Inplace.OUT_OF_PLACE else x_tensor
    descriptor = infiniopReluDescriptor_t()
    check_error(
        lib.infiniopCreateReluDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopRelu(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyReluDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every Relu case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cpu", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every Relu case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "cuda", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every Relu case on the Cambricon MLU backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device
    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for tensor_shape, inplace in test_cases:
        for dtype in (torch.float16, torch.float32):
            test(lib, handle, "mlu", tensor_shape, tensor_dtype=dtype, inplace=inplace)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    test_cases = [
        # tensor_shape, inplace
        ((), Inplace.OUT_OF_PLACE),  # 0-d (scalar) tensor
        ((), Inplace.INPLACE_X),
        ((1, 3), Inplace.OUT_OF_PLACE),
        ((3, 3), Inplace.OUT_OF_PLACE),
        ((3, 3, 13, 9, 17), Inplace.INPLACE_X),
        ((32, 20, 512), Inplace.INPLACE_X),
        ((33, 333, 333), Inplace.OUT_OF_PLACE),
        ((32, 256, 112, 112), Inplace.OUT_OF_PLACE),
    ]
    args = get_args()
    lib = open_lib()
    # Declare C signatures so ctypes marshals arguments/returns correctly.
    lib.infiniopCreateReluDescriptor.restype = c_int32
    lib.infiniopCreateReluDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopReluDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopRelu.restype = c_int32
    lib.infiniopRelu.argtypes = [
        infiniopReluDescriptor_t,
        c_void_p,  # y data
        c_void_p,  # x data
        c_void_p,  # stream (None -> default)
    ]
    lib.infiniopDestroyReluDescriptor.restype = c_int32
    lib.infiniopDestroyReluDescriptor.argtypes = [
        infiniopReluDescriptor_t,
    ]
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    # Default to the CPU backend when no device flag was given.
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment