Commit 46da1a27 authored by PanZezhongQY

feat: cpu and cuda matmul

import libinfiniop
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
from enum import Enum, auto
import torch
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
class AddDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(x, y):
return torch.add(x, y)
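# Added illustration (not part of the original test): torch.add broadcasts its
# operands, e.g. shapes (2, 1, 3) and (4, 3) broadcast to (2, 4, 3), which matches
# the mixed-rank test case listed below.
assert add(torch.zeros(2, 1, 3), torch.zeros(4, 3)).shape == (2, 4, 3)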
def test(
lib,
handle,
torch_device,
c_shape,
a_shape,
b_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
):
print(
f"Testing Add on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}"
)
if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE:
print("Unsupported test: broadcasting does not support in-place")
return
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
ans = add(a, b)
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
descriptor = infiniopAddDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
c_tensor.descriptor.contents.invalidate()
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
check_error(
lib.infiniopAdd(descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None)
)
assert torch.allclose(c, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE),
((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE),
((), (), (), Inplace.OUT_OF_PLACE),
((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE),
((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A),
((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B),
((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE),
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
CTensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
import torch.nn.functional as F
class AttentionDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
y = x.clone()
masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32))
return torch.nn.functional.softmax(masked, dim=-1).to(type)
def attention(q, k, v, k_cache, v_cache, pos):
type = q.dtype
n_q_head = q.shape[0]
n_kv_head = k.shape[0]
# Concatenate key and value caches
k_cache = k_cache[:, :pos, :] # (n_kv_head, pos, head_dim)
v_cache = v_cache[:, :pos, :] # (n_kv_head, pos, head_dim)
k = torch.cat([k_cache, k], dim=1) # (n_kv_head, total_seq_len, head_dim)
v = torch.cat([v_cache, v], dim=1) # (n_kv_head, total_seq_len, head_dim)
total_seq_len = k.shape[1]
head_dim = v.shape[-1]
if n_q_head != n_kv_head:
q = q.reshape(
n_kv_head, -1, head_dim
) # (n_kv_head, n_group * seq_len, head_dim)
# Scaled dot-product attention
attn_scores = (
torch.einsum("hqd,hkd->hqk", q.to(torch.float32), k.to(torch.float32))
.to(type)
.reshape(n_q_head, -1, total_seq_len)
) # (n_q_head, seq_len, total_seq_len)
attn_scores = attn_scores / (head_dim**0.5)
attn_weights = causal_softmax(attn_scores).reshape(
n_kv_head, -1, total_seq_len
) # (n_kv_head, seq_len, total_seq_len)
# Weighted sum of values
attn_output = (
torch.einsum(
"hqk,hkd->hqd", attn_weights.to(torch.float32), v.to(torch.float32)
)
.to(type)
.reshape(n_q_head, -1, head_dim)
.permute(1, 0, 2)
) # ([seq_len, n_q_head, head_dim])
return attn_output
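# Added illustration (hypothetical small shapes, not part of the original test):
# for grouped-query attention with n_q_head=8, n_kv_head=4, seq_len=2, head_dim=16
# and pos=1 (the third test case below), q of shape (8, 2, 16) is regrouped to
# (4, 4, 16), i.e. n_group * seq_len rows per kv head, and the output comes back
# as (seq_len, n_q_head, head_dim) = (2, 8, 16).
_q = torch.rand(8, 2, 16)
_k = torch.rand(4, 2, 16)
_v = torch.rand(4, 2, 16)
_k_cache = torch.rand(4, 8, 16)
_v_cache = torch.rand(4, 8, 16)
assert attention(_q, _k, _v, _k_cache, _v_cache, pos=1).shape == (2, 8, 16)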
def test(
lib,
handle,
torch_device,
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype=torch.float16,
q_stride=None,
k_stride=None,
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
)
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k_cache = (
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
)
v_cache = (
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
)
ans = attention(q, k, v, k_cache, v_cache, pos)
if q_stride is not None:
q = rearrange_tensor(q, q_stride)
if k_stride is not None:
k = rearrange_tensor(k, k_stride)
if v_stride is not None:
v = rearrange_tensor(v, v_stride)
if k_cache_stride is not None:
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None:
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
descriptor = infiniopAttentionDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
handle,
ctypes.byref(descriptor),
out_tensor.descriptor,
q_tensor.descriptor,
k_tensor.descriptor,
v_tensor.descriptor,
k_cache_tensor.descriptor,
v_cache_tensor.descriptor,
pos,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
out_tensor.descriptor.contents.invalidate()
q_tensor.descriptor.contents.invalidate()
k_tensor.descriptor.contents.invalidate()
v_tensor.descriptor.contents.invalidate()
k_cache_tensor.descriptor.contents.invalidate()
v_cache_tensor.descriptor.contents.invalidate()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, out.device)
check_error(
lib.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
None,
)
)
assert torch.allclose(out, ans, atol=1e-4, rtol=1e-2)
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cpu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"cuda",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
) in test_cases:
test(
lib,
handle,
"mlu",
n_q_head,
n_kv_head,
seq_len,
head_dim,
pos,
k_cache_buf_len,
v_cache_buf_len,
dtype,
q_stride,
k_stride,
v_stride,
k_cache_stride,
v_cache_stride,
)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# prefill
(
32, # n_q_head
4, # n_kv_head
5, # seq_len
64, # head_dim
0, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
[64, 11264, 1], # k_cache_stride
[64, 11264, 1], # v_cache_stride
),
# decode
(
32, # n_q_head
4, # n_kv_head
1, # seq_len
64, # head_dim
3, # pos
2048, # k_cache_buf_len
2048, # v_cache_buf_len
torch.float16, # dtype
[64, 2560, 1], # q_stride
[64, 2560, 1], # k_stride
[64, 2560, 1], # v_stride
[64, 11264, 1], # k_cache_stride
[64, 11264, 1], # v_cache_stride
),
# for test
(
8, # n_q_head
4, # n_kv_head
2, # seq_len
16, # head_dim
1, # pos
8, # k_cache_buf_len
8, # v_cache_buf_len
torch.float16, # dtype
None, # q_stride
None, # k_stride
None, # v_stride
None, # k_cache_stride
None, # v_cache_stride
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
pooling_layers = {
1: torch.nn.AvgPool1d,
2: torch.nn.AvgPool2d,
3: torch.nn.AvgPool3d,
}
ndim = len(x.shape) - 2
if ndim not in pooling_layers:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
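# Note (added): the fp16 3-D case below is routed through fp32, presumably because
# AvgPool3d does not support half precision on every backend; the result is cast
# back to fp16 for comparison.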
if ndim == 3 and x.dtype == torch.float16:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16)
else:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
if PROFILE:
torch.cuda.synchronize()
return ans
def inferShape(x_shape, kernel_shape, padding, strides):
assert (
len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
input_shape = x_shape[2:]
output_shape = []
for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides):
output_dim = (dim + 2 * p - k) // s + 1
output_shape.append(output_dim)
return x_shape[:2] + tuple(output_shape)
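# Added sanity check (illustration only): with the second test case below,
# x_shape (32, 3, 224, 224), kernel (3, 3), padding (1, 1), strides (2, 2),
# each spatial dim becomes (224 + 2*1 - 3) // 2 + 1 = 112.
assert inferShape((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)) == (32, 3, 112, 112)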
# convert a Python tuple to a ctypes void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
k_shape,
padding,
strides,
tensor_dtype=torch.float16,
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = pool(x, k_shape, padding, strides)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = pool(x, k_shape, padding, strides)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
tuple_to_void_p(k_shape),
tuple_to_void_p(padding),
tuple_to_void_p(strides),
len(k_shape),
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
infiniopAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAvgPool.restype = c_int32
lib.infiniopAvgPool.argtypes = [
infiniopAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
infiniopAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
create_workspace,
)
from operatorspy.tests.test_utils import get_args
import torch
class CausalSoftmaxDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
y = x.clone()
masked = torch.where(mask == 1, -torch.inf, y.to(torch.float32))
return torch.nn.functional.softmax(masked, dim=-1).to(type)
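# Added illustration (not part of the original test): the flipped lower-triangular
# mask is aligned to the bottom-right corner, so for a 2x3 input only the top-right
# entry is masked, and every row still sums to 1 after the softmax.
_demo = causal_softmax(torch.rand(2, 3))
assert _demo[0, -1].item() == 0.0
assert torch.allclose(_demo.sum(dim=-1), torch.ones(2))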
def test(lib, handle, torch_device, x_shape, x_stride=None, x_dtype=torch.float16):
print(
f"Testing CausalSoftmax on {torch_device} with x_shape:{x_shape} x_stride:{x_stride} dtype:{x_dtype}"
)
x = torch.rand(x_shape, dtype=x_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
ans = causal_softmax(x)
x_tensor = to_tensor(x, lib)
descriptor = infiniopCausalSoftmaxDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), x_tensor.descriptor
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
workspace = create_workspace(workspace_size.value, x.device)
check_error(
lib.infiniopCausalSoftmax(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
x_tensor.data,
None,
)
)
assert torch.allclose(x, ans, atol=0, rtol=1e-2)
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "cpu", x_shape, x_stride)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "cuda", x_shape, x_stride)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "mlu", x_shape, x_stride)
destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
import torch_npu
device = DeviceEnum.DEVICE_ASCEND
handle = create_handle(lib, device)
for x_shape, x_stride in test_cases:
test(lib, handle, "npu", x_shape, x_stride)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, x_stride
((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)), # Ascend does not yet support non-contiguous tensors
]
args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
import math
from torch.nn import functional as F
from typing import List, Tuple
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ConvDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
match len(x.shape) - 2:
case 1:
return F.conv1d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case 2:
return F.conv2d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case 3:
return F.conv3d(
x, w, stride=stride, padding=padding, dilation=dilation
)
case _:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
# infer the shape of the output given the inputs for an N-ary convolution
def inferShape(
x_shape: List[int],
w_shape: List[int],
pads: List[int],
strides: List[int],
dilations: List[int],
) -> Tuple[int, ...]:
assert (
len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims = [
math.floor(
(x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1)
/ strides[i]
+ 1
)
for i in range(len(pads))
]
return (x_shape[0], w_shape[0]) + tuple(output_dims)
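# Added sanity check (illustration only): for the third test case below,
# x (32, 3, 128, 128), w (64, 3, 5, 5), pads (2, 2), strides (2, 2), dilations (1, 1),
# each spatial dim is floor((128 + 2*2 - 1*(5 - 1) - 1) / 2 + 1) = 64.
assert inferShape((32, 3, 128, 128), (64, 3, 5, 5), (2, 2), (2, 2), (1, 1)) == (32, 64, 64, 64)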
# convert a Python tuple to a ctypes void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
w_shape,
pads,
strides,
dilations,
tensor_stride=None,
tensor_dtype=torch.float16,
):
assert len(pads) == len(strides) == len(dilations)
print(
f"Testing Conv on {torch_device} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {tensor_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
w = torch.rand(w_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(
inferShape(x.shape, w.shape, pads, strides, dilations), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = conv(x, w, strides, pads, dilations)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = conv(x, w, strides, pads, dilations)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
w_tensor = to_tensor(w, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopConvDescriptor_t()
check_error(
lib.infiniopCreateConvDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
w_tensor.descriptor,
tuple_to_void_p(pads),
tuple_to_void_p(strides),
tuple_to_void_p(dilations),
len(pads),
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
w_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopConv(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
w_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
if (tensor_dtype == torch.float16):
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyConvDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, w_shape, pads, strides, dilations, x_strides
(
(32, 3, 4),
(32, 3, 5),
(1,),
(1,),
(1,),
None,
),
(
(1, 3, 4, 4),
(2, 3, 3, 3),
(1, 1),
(1, 2),
(2, 1),
None,
),
(
(32, 3, 128, 128),
(64, 3, 5, 5),
(2, 2),
(2, 2),
(1, 1),
None,
),
(
(1, 1, 4, 4, 4),
(1, 1, 5, 5, 5),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
None,
),
(
(32, 3, 32, 32, 32),
(64, 3, 5, 5, 5),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
None,
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateConvDescriptor.restype = c_int32
lib.infiniopCreateConvDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopConvDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopConv.restype = c_int32
lib.infiniopConv.argtypes = [
infiniopConvDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyConvDescriptor.restype = c_int32
lib.infiniopDestroyConvDescriptor.argtypes = [
infiniopConvDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ExpandDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
if PROFILE:
ans = x.expand_as(y).clone()
torch.cuda.synchronize()
return ans
return x.expand_as(y)
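# Added illustration (not part of the original test): expand_as broadcasts x to
# y's shape without copying data, e.g. a (1, 3) tensor viewed as (2, 4, 3),
# matching one of the test cases below.
assert expand(torch.zeros(1, 3), torch.zeros(2, 4, 3)).shape == (2, 4, 3)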
def test(
lib,
handle,
torch_device,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = expand(x, y)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = expand(x, y)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
((5, 4, 3), (4, 3,), None, (6, 1)),
((99, 111), (111,), None, None),
((2, 4, 3), (1, 3), None, None),
((2, 20, 3), (2, 1, 3), None, None),
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateExpandDescriptor.restype = c_int32
lib.infiniopCreateExpandDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopExpandDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopExpand.restype = c_int32
lib.infiniopExpand.argtypes = [
infiniopExpandDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyExpandDescriptor.restype = c_int32
lib.infiniopDestroyExpandDescriptor.argtypes = [
infiniopExpandDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float, c_bool
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
A = A.T if transA else A
B = B.T if transB else B
result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype)
if C is not None:
# accumulate beta * C in fp32 for fp16 inputs to avoid mixed-dtype in-place adds, then cast back
result = (result + beta * (C.to(torch.float32) if dtype == torch.float16 else C)).to(dtype)
if PROFILE:
torch.cuda.synchronize()
return result
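# Added illustration (hypothetical small shapes, not part of the original test):
# with transA=True, an A of shape (3, 2) is used as (2, 3), so its product with a
# (3, 4) B has shape (2, 4); beta only takes effect when C is provided.
assert gemm(torch.rand(3, 2), torch.rand(3, 4), transA=True, dtype=torch.float32).shape == (2, 4)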
def test(
lib,
handle,
torch_device,
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride=None,
b_stride=None,
c_stride=None,
y_stride=None,
dtype=torch.float16,
):
print(
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
)
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.rand(c_shape, dtype=dtype).to(torch_device) if c_shape else None
y = torch.rand(y_shape, dtype=dtype).to(torch_device)
if a_stride is not None:
a = rearrange_tensor(a, a_stride)
if b_stride is not None:
b = rearrange_tensor(b, b_stride)
if c_stride is not None and c is not None:
c = rearrange_tensor(c, c_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = gemm(a, b, c, transA, transB, alpha, beta, dtype)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = gemm(a, b, c, transA, transB, alpha, beta, dtype)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if c is not None else None
y_tensor = to_tensor(y, lib)
descriptor = infiniopGEMMDescriptor_t()
check_error(
lib.infiniopCreateGEMMDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c_tensor.descriptor if c_tensor else None,
alpha,
beta,
transA,
transB,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
a_tensor.descriptor.contents.invalidate()
b_tensor.descriptor.contents.invalidate()
if c_tensor is not None:
c_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspace_size = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGEMMWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGEMM(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGEMM(
descriptor,
workspace_ptr,
workspace_size,
y_tensor.data,
a_tensor.data,
b_tensor.data,
c_tensor.data if c_tensor else None,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
check_error(lib.infiniopDestroyGEMMDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for (
alpha,
beta,
transA,
transB,
a_shape,
b_shape,
c_shape,
y_shape,
a_stride,
b_stride,
c_stride,
y_stride,
) in test_cases:
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride
(
1.0,
1.0,
False,
False,
(1, 2048),
(2048, 2048),
(1, 2048),
(1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
True,
(2048, 4),
(2048, 2048),
(4, 2048),
(4, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
False,
True,
(1, 2048),
(1000, 2048),
(1000,),
(1, 1000),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
(2048,),
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
(
1.0,
1.0,
False,
False,
(3, 1, 2048),
(3, 2048, 2048),
(1,),
(3, 1, 2048),
None,
None,
None,
None,
),
(
1.0,
1.0,
True,
False,
(2048, 4),
(2048, 2048),
None,
(4, 2048),
(4096, 1),
(4096, 1),
(2,),
(4096, 1),
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGEMMDescriptor.restype = c_int32
lib.infiniopCreateGEMMDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGEMMDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
c_float,
c_bool,
c_bool,
]
lib.infiniopGetGEMMWorkspaceSize.restype = c_int32
lib.infiniopGetGEMMWorkspaceSize.argtypes = [
infiniopGEMMDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopGEMM.restype = c_int32
lib.infiniopGEMM.argtypes = [
infiniopGEMMDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyGEMMDescriptor.restype = c_int32
lib.infiniopDestroyGEMMDescriptor.argtypes = [
infiniopGEMMDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
# constants that control whether to profile the PyTorch and lib functions
# NOTE: a synchronization call must be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GlobalAvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
return x.shape[:2] + (1,) * (x.dim() - 2)
def globalAvgPool(x):
y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
if PROFILE:
torch.cuda.synchronize()
return y.view(*inferShape(x))
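# Added sanity check (illustration only): global average pooling keeps the batch
# and channel dims and reduces every remaining dim to 1, e.g. (2, 3, 4, 5) -> (2, 3, 1, 1).
assert globalAvgPool(torch.rand(2, 3, 4, 5)).shape == (2, 3, 1, 1)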
def test(
lib,
handle,
torch_device,
x_shape,
tensor_dtype=torch.float16,
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = globalAvgPool(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = globalAvgPool(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGlobalAvgPoolWorkspaceSize(
descriptor, ctypes.byref(workspaceSize)
)
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape
((1, 3, 3)),
((1, 3, 1, 1, 3)),
((1, 3, 1, 1, 257)),
((1, 2, 1, 1, 514)),
((1, 3, 1, 1, 1025)),
((32, 256, 1, 112, 112)),
((2, 3, 2048000)),
((2, 1, 10243)),
((2, 20, 100)),
((3, 33, 333)),
((32, 20, 512)),
((3, 3, 11, 11, 11, 3, 2)),
((32, 256, 1, 112, 112)),
((32, 256, 112, 112)),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGlobalAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopGlobalAvgPool.restype = c_int32
lib.infiniopGlobalAvgPool.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
from .devices import *
from .utils import *
from .datatypes import *
class InfiniDtype:
INVALID = 0
BYTE = 1
BOOL = 2
I8 = 3
I16 = 4
I32 = 5
I64 = 6
U8 = 7
U16 = 8
U32 = 9
U64 = 10
F8 = 11
F16 = 12
F32 = 13
F64 = 14
C8 = 15
C16 = 16
C32 = 17
C64 = 18
BF16 = 19
class InfiniDeviceEnum:
CPU = 0
NVIDIA = 1
CAMBRICON = 2
ASCEND = 3
METAX = 4
MOORE = 5
ILUVATAR = 6
KUNLUN = 7
SUGON = 8