Unverified Commit 2790a7b2 authored by PanZezhong1725, committed by GitHub

Merge pull request #308 from InfiniTensor/issue/307

issue/307: unify test tensor creation in PyTorch tests
parents 70146b74 f62e952e
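
The diff below replaces the hand-rolled torch.rand/to_tensor/ctypes setup in each test with the shared TestTensor and TestWorkspace helpers from libinfiniop. As a rough sketch of the adopted pattern (argument names and defaults are inferred from the call sites in this diff, so treat them as assumptions rather than the definitive API):

from libinfiniop import TestTensor, TestWorkspace, InfiniDtype

def example_inputs(shape, stride, device):
    # Random input; strides may include 0 to express a broadcast view.
    a = TestTensor(shape, stride, InfiniDtype.F16, device)
    # "mode" picks the fill pattern seen in the diff ("ones", "zeros"); the attention
    # and clip tests additionally pass scale=... and bias=... where needed.
    c = TestTensor(shape, None, InfiniDtype.F16, device, mode="ones")
    # a.torch_tensor()  -> tensor the PyTorch reference implementation runs on
    # a.actual_tensor() -> tensor the library wrote into, checked with torch.allclose
    # a.descriptor / a.data() -> what the infiniop C API consumes; call destroy_desc()
    #                            once the operator descriptor has been created
    return a, c
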
......@@ -4,15 +4,10 @@
#include "infiniop/handle.h"
#include "infiniop/ops/add.h"
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
......
#ifndef __INFINIOP_AVG_POOL_API_H__
#define __INFINIOP_AVG_POOL_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t;
__C __export infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle,
infiniopAvgPoolDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
size_t const *kernel_shape,
size_t const *pads,
ptrdiff_t const *strides,
size_t n);
__C __export infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc,
void *workspace, size_t workspace_size,
void *y, void const *x, void *stream);
__C __export infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_EXPAND_API_H__
#define __INFINIOP_EXPAND_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopExpandDescriptor_t;
__C __export infiniStatus_t infiniopCreateExpandDescriptor(infiniopHandle_t handle,
infiniopExpandDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
__C __export infiniStatus_t infiniopExpand(infiniopExpandDescriptor_t desc,
void *y,
void const *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc);
#endif
#ifndef __INFINIOP_GLOBAL_AVG_POOL_API_H__
#define __INFINIOP_GLOBAL_AVG_POOL_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopGlobalAvgPoolDescriptor_t;
__C __export infiniStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle,
infiniopGlobalAvgPoolDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
__C __export infiniStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc,
void *workspace, size_t workspace_size,
void *y, void const *x, void *stream);
__C __export infiniStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MAX_POOL_API_H__
#define __INFINIOP_MAX_POOL_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t;
__C __export infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle,
infiniopMaxPoolDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
size_t const *kernel_shape,
size_t const *pads,
ptrdiff_t const *strides,
size_t n);
__C __export infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc,
void *workspace, size_t workspace_size,
void *y, void const *x, void *stream);
__C __export infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MLP_API_H__
#define __INFINIOP_MLP_API_H__
#include "../operator_descriptor.h"
#include "gemm.h"
#include "swiglu.h"
typedef struct InfiniopDescriptor *infiniopMLPDescriptor_t;
__C __export infiniStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle,
infiniopMLPDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w12_desc,
infiniopTensorDescriptor_t w3_desc,
float alpha,
char residual);
__C __export infiniStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w12,
const void *w3,
void *stream);
__C __export infiniStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
#endif
......@@ -13,17 +13,17 @@ def run_tests(args):
failed = []
for test in [
"add.py",
"attention.py",
"causal_softmax.py",
"clip.py",
"gemm.py",
"mul.py",
"random_sample.py",
"rearrange.py",
"rms_norm.py",
"rope.py",
"sub.py",
"swiglu.py",
"attention.py",
"causal_softmax.py",
"rearrange.py",
"mul.py",
]:
result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -58,12 +59,12 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
}
DEBUG = False
......@@ -72,52 +73,13 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AddDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(ans, x, y):
torch.add(x, y, out=ans)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
Rearrange the tensors if needed and apply the inplace configuration.
If inplace is requested and the output (i.e., c) would alias a broadcasted input,
the inplace configuration is ignored and out-of-place computation is used.
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# If inplace is requested and c has broadcasted (stride-0) dimensions, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def add(c, a, b):
torch.add(a, b, out=c)
def test(
lib,
handle,
torch_device,
device,
shape,
a_stride=None,
b_stride=None,
......@@ -126,58 +88,64 @@ def test(
dtype=torch.float16,
sync=None,
):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if a_stride != c_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device, mode="ones")
if c.is_broadcast():
return
print(
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
f"Testing Add on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
ans = torch.zeros(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
add(ans, a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopAddDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAddDescriptor(
LIBINFINIOP.infiniopCreateAddDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAddWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, c.device)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_add():
check_error(
lib.infiniopAdd(
LIBINFINIOP.infiniopAdd(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
......@@ -186,52 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: add(ans, a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAddDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -240,6 +176,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p
from ctypes import c_uint64
import ctypes
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from libinfiniop import (
open_lib,
to_tensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
check_error,
rearrange_tensor,
create_workspace,
get_args,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
import torch
class AttentionDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos):
def test(
lib,
handle,
torch_device,
device,
n_q_head,
n_kv_head,
seq_len,
......@@ -100,94 +93,79 @@ def test(
v_stride=None,
k_cache_stride=None,
v_cache_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
f"Testing Attention on {InfiniDeviceNames[device]} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{InfiniDtypeNames[dtype]} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
)
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device)
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1
k_cache = (
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
out = TestTensor([seq_len, n_q_head, head_dim], None, dtype, device, mode="zeros")
q = TestTensor([n_q_head, seq_len, head_dim], q_stride, dtype, device, scale=0.1)
k = TestTensor([n_kv_head, seq_len, head_dim], k_stride, dtype, device, scale=0.1)
v = TestTensor([n_kv_head, seq_len, head_dim], v_stride, dtype, device, scale=0.1)
k_cache = TestTensor(
[n_kv_head, k_cache_buf_len, head_dim], k_cache_stride, dtype, device, scale=0.1
)
v_cache = (
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device)
* 0.1
v_cache = TestTensor(
[n_kv_head, v_cache_buf_len, head_dim], v_cache_stride, dtype, device, scale=0.1
)
ans = attention(q, k, v, k_cache, v_cache, pos)
if q_stride is not None:
q = rearrange_tensor(q, q_stride)
if k_stride is not None:
k = rearrange_tensor(k, k_stride)
if v_stride is not None:
v = rearrange_tensor(v, v_stride)
if k_cache_stride is not None:
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None:
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
def torch_attention():
return attention(
q.torch_tensor(),
k.torch_tensor(),
v.torch_tensor(),
k_cache.torch_tensor(),
v_cache.torch_tensor(),
pos,
)
ans = torch_attention()
if sync is not None:
sync()
descriptor = infiniopAttentionDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateAttentionDescriptor(
LIBINFINIOP.infiniopCreateAttentionDescriptor(
handle,
ctypes.byref(descriptor),
out_tensor.descriptor,
q_tensor.descriptor,
k_tensor.descriptor,
v_tensor.descriptor,
k_cache_tensor.descriptor,
v_cache_tensor.descriptor,
out.descriptor,
q.descriptor,
k.descriptor,
v.descriptor,
k_cache.descriptor,
v_cache.descriptor,
pos,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [
out_tensor,
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
for tensor in [out, q, k, v, k_cache, v_cache]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetAttentionWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, out.device)
workspace = TestWorkspace(workspace_size.value, out.device)
def lib_attention():
check_error(
lib.infiniopAttention(
LIBINFINIOP.infiniopAttention(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
out_tensor.data,
q_tensor.data,
k_tensor.data,
v_tensor.data,
k_cache_tensor.data,
v_cache_tensor.data,
out.data(),
q.data(),
k.data(),
v.data(),
k_cache.data(),
v_cache.data(),
None,
)
)
......@@ -197,25 +175,25 @@ def test(
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out, ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol)
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3},
InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-3},
}
DEBUG = False
......@@ -284,45 +262,6 @@ if __name__ == "__main__":
),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -332,5 +271,5 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES)
test_operator(device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# Constants controlling whether to profile the PyTorch and lib functions.
# NOTE: a synchronization call needs to be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class AvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
pooling_layers = {
1: torch.nn.AvgPool1d,
2: torch.nn.AvgPool2d,
3: torch.nn.AvgPool3d,
}
ndim = len(x.shape) - 2
if ndim not in pooling_layers:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
if ndim == 3 and x.dtype == torch.float16:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
x.to(torch.float32)
).to(torch.float16)
else:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
if PROFILE:
torch.cuda.synchronize()
return ans
def inferShape(x_shape, kernel_shape, padding, strides):
assert (
len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
input_shape = x_shape[2:]
output_shape = []
for dim, k, p, s in zip(input_shape, kernel_shape, padding, strides):
output_dim = (dim + 2 * p - k) // s + 1
output_shape.append(output_dim)
return x_shape[:2] + tuple(output_shape)
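# Hedged worked example (added for illustration, not part of the original test):
# for the (32, 3, 224, 224) case below with kernel (3, 3), padding (1, 1) and
# strides (2, 2), each spatial dim becomes (224 + 2*1 - 3) // 2 + 1 = 112,
# so inferShape returns (32, 3, 112, 112).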
# Convert a Python tuple to a ctypes void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
k_shape,
padding,
strides,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(
inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = pool(x, k_shape, padding, strides)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = pool(x, k_shape, padding, strides)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
tuple_to_void_p(k_shape),
tuple_to_void_p(padding),
tuple_to_void_p(strides),
len(k_shape),
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
infiniopAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAvgPool.restype = c_int32
lib.infiniopAvgPool.argtypes = [
infiniopAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
infiniopAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -34,13 +35,13 @@ _TEST_CASES_ = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
......@@ -66,13 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class CausalSoftmaxDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x):
type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
......@@ -81,66 +75,57 @@ def causal_softmax(x):
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}"
f"Testing CausalSoftmax on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
x = torch.where(mask == 1, torch.full_like(x, torch.finfo(x.dtype).max), x)
ans = causal_softmax(x)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
x = TestTensor(shape, x_stride, dtype, device)
ans = causal_softmax(x.torch_tensor())
if inplace == Inplace.INPLACE_X:
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
y = TestTensor(shape, x_stride, dtype, device)
if sync is not None:
sync()
descriptor = infiniopCausalSoftmaxDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor
LIBINFINIOP.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.destroyDesc(lib)
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize(
LIBINFINIOP.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_causal_softmax():
check_error(
lib.infiniopCausalSoftmax(
LIBINFINIOP.infiniopCausalSoftmax(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
y_tensor.data,
x_tensor.data,
y.data(),
x.data(),
None,
)
)
......@@ -152,49 +137,21 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: causal_softmax(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -203,6 +160,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -2,21 +2,22 @@
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
......@@ -51,12 +52,12 @@ _TEST_CASES_ = [
]
_TENSOR_DTYPES = [torch.float16, torch.float32]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6},
}
......@@ -82,156 +83,108 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ClipDescriptor(Structure):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)]
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def clip(y, x, min_val, max_val):
torch.clamp(x, min_val, max_val, out=y)
def test(
lib,
handle,
torch_device,
device,
shape,
x_stride=None,
y_stride=None,
min_val=-1.0,
max_val=1.0,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32,
dtype=InfiniDtype.F32,
sync=None,
):
print(
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}"
x = TestTensor(shape, x_stride, dtype, device)
min_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=min_val
)
x = torch.rand(shape, dtype=dtype).to(torch_device)
ans = clip(x, min_val, max_val)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
max_ = TestTensor(
shape, [0 for _ in shape], dtype, device, mode="zeros", bias=max_val
)
if inplace == Inplace.INPLACE_X:
if x_stride != y_stride:
return
y = x
y_tensor = x_tensor
else:
y = torch.zeros(shape, dtype=dtype).to(torch_device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
y = TestTensor(shape, y_stride, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Clip on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val)
if sync is not None:
sync()
descriptor = infiniopClipDescriptor_t()
min_, max_ = torch.tensor([min_val], dtype=dtype).to(torch_device), torch.tensor(
[max_val], dtype=dtype
).to(torch_device)
min_tensor = to_tensor(
min_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
max_tensor = to_tensor(
max_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateClipDescriptor(
LIBINFINIOP.infiniopCreateClipDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
min_tensor.descriptor,
max_tensor.descriptor,
y.descriptor,
x.descriptor,
min_.descriptor,
max_.descriptor,
)
)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetClipWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, x.device)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_clip():
check_error(
lib.infiniopClip(
LIBINFINIOP.infiniopClip(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data() if workspace is not None else None,
workspace_size.value,
y_tensor.data,
x_tensor.data,
min_tensor.data,
max_tensor.data,
y.data(),
x.data(),
min_.data(),
max_.data(),
None,
)
)
lib_clip()
# Now we can destroy the tensor descriptors
x_tensor.destroyDesc(lib)
if inplace != Inplace.INPLACE_X:
y_tensor.destroyDesc(lib)
# Destroy the tensor descriptors
for tensor in [x, y, min_, max_]:
tensor.destroy_desc()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol):
print("\nExpected:")
print(ans)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
......@@ -239,6 +192,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# Constants controlling whether to profile the PyTorch and lib functions.
# NOTE: a synchronization call needs to be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class ExpandDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
if PROFILE:
ans = x.expand_as(y).clone()
torch.cuda.synchronize()
return ans
return x.expand_as(y)
def test(
lib,
handle,
torch_device,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
if x_stride is not None:
x = rearrange_tensor(x, x_stride)
if y_stride is not None:
y = rearrange_tensor(y, y_stride)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = expand(x, y)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = expand(x, y)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopExpandDescriptor_t()
check_error(
lib.infiniopCreateExpandDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
((5, 4, 3), (4, 3,), None, (6, 1)),
((99, 111), (111,), None, None),
((2, 4, 3), (1, 3), None, None),
((2, 20, 3), (2, 1, 3), None, None),
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateExpandDescriptor.restype = c_int32
lib.infiniopCreateExpandDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopExpandDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopExpand.restype = c_int32
lib.infiniopExpand.argtypes = [
infiniopExpandDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyExpandDescriptor.restype = c_int32
lib.infiniopDestroyExpandDescriptor.argtypes = [
infiniopExpandDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
from ctypes import c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
rearrange_if_needed,
create_workspace,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
......@@ -31,13 +32,13 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32, torch.bfloat16]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3},
torch.bfloat16: {"atol": 0, "rtol": 5e-2},
InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
}
DEBUG = False
......@@ -46,16 +47,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
class GemmDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
# PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha):
try:
......@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha):
# The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES
def test(
lib,
handle,
torch_device,
device,
alpha,
beta,
a_shape,
......@@ -84,65 +74,71 @@ def test(
a_stride=None,
b_stride=None,
c_stride=None,
dtype=torch.float16,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta},"
f"Testing Gemm on {InfiniDeviceNames[device]} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{InfiniDtypeNames[dtype]}"
)
# Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device)
ans = torch.zeros(c_shape, dtype=dtype).to(torch_device)
a = TestTensor(a_shape, a_stride, dtype, device)
b = TestTensor(b_shape, b_stride, dtype, device)
c = TestTensor(c_shape, c_stride, dtype, device, mode="ones")
ans = TestTensor(c_shape, c_stride, dtype, device, mode="zeros")
# Compute the PyTorch reference result
gemm(ans, c, beta, a, b, alpha)
def torch_gemm():
gemm(
ans.torch_tensor(),
c.torch_tensor(),
beta,
a.torch_tensor(),
b.torch_tensor(),
alpha,
)
a, b, c = [
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
torch_gemm()
if sync is not None:
sync()
descriptor = infiniopGemmDescriptor_t()
descriptor = infiniopOperatorDescriptor_t()
check_error(
lib.infiniopCreateGemmDescriptor(
LIBINFINIOP.infiniopCreateGemmDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
c.descriptor,
a.descriptor,
b.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
for tensor in [a, b, c]:
tensor.destroy_desc()
# Get workspace size and create workspace
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size))
LIBINFINIOP.infiniopGetGemmWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = create_workspace(workspace_size.value, a.device)
workspace = TestWorkspace(workspace_size.value, device)
# Execute infiniop gemm operator
def lib_gemm():
check_error(
lib.infiniopGemm(
LIBINFINIOP.infiniopGemm(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace.data(),
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
c.data(),
a.data(),
b.data(),
alpha,
beta,
None,
......@@ -155,17 +151,17 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: gemm(ans, c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyGemmDescriptor(descriptor))
check_error(LIBINFINIOP.infiniopDestroyGemmDescriptor(descriptor))
# ==============================================================================
......@@ -173,40 +169,6 @@ def test(
# ==============================================================================
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
......@@ -216,6 +178,6 @@ if __name__ == "__main__":
# Execute tests
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# Constants controlling whether to profile the PyTorch and lib functions.
# NOTE: a synchronization call needs to be added manually to the lib function,
# e.g., cudaDeviceSynchronize() for CUDA
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GlobalAvgPoolDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
return x.shape[:2] + (1,) * (x.dim() - 2)
def globalAvgPool(x):
y = torch.mean(x, dim=tuple(range(2, x.dim())), keepdim=True)
if PROFILE:
torch.cuda.synchronize()
return y.view(*inferShape(x))
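# Hedged example (added for illustration): for an input of shape (32, 256, 112, 112),
# the mean is taken over the two trailing spatial dims and the result is reshaped to
# inferShape(x) == (32, 256, 1, 1).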
def test(
lib,
handle,
torch_device,
x_shape,
tensor_dtype=torch.float16,
sync=None
):
print(
f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = globalAvgPool(x)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
_ = globalAvgPool(x)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f"pytorch time: {elapsed :6f}")
x_tensor = to_tensor(x, lib)
y_tensor = to_tensor(y, lib)
if sync is not None:
sync()
descriptor = infiniopGlobalAvgPoolDescriptor_t()
check_error(
lib.infiniopCreateGlobalAvgPoolDescriptor(
handle,
ctypes.byref(descriptor),
y_tensor.descriptor,
x_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.descriptor.contents.invalidate()
y_tensor.descriptor.contents.invalidate()
workspaceSize = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGlobalAvgPoolWorkspaceSize(
descriptor, ctypes.byref(workspaceSize)
)
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
start_time = time.time()
for i in range(NUM_ITERATIONS):
check_error(
lib.infiniopGlobalAvgPool(
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
import torch_mlu
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape in test_cases:
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, tensor_dtype=torch.float32)
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape
((1, 3, 3)),
((1, 3, 1, 1, 3)),
((1, 3, 1, 1, 257)),
((1, 2, 1, 1, 514)),
((1, 3, 1, 1, 1025)),
((32, 256, 1, 112, 112)),
((2, 3, 2048000)),
((2, 1, 10243)),
((2, 20, 100)),
((3, 33, 333)),
((32, 20, 512)),
((3, 3, 11, 11, 11, 3, 2)),
((32, 256, 1, 112, 112)),
((32, 256, 112, 112)),
]
args = get_args()
lib = open_lib()
lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGlobalAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopGlobalAvgPool.restype = c_int32
lib.infiniopGlobalAvgPool.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
infiniopGlobalAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
......@@ -4,10 +4,11 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import (
open_lib,
CTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
LIBINFINIOP,
)
from .devices import *
from .utils import *
from .datatypes import *
from .structs import *
......@@ -19,3 +19,27 @@ class InfiniDtype:
C32 = 17
C64 = 18
BF16 = 19
InfiniDtypeNames = {
InfiniDtype.INVALID: "INVALID",
InfiniDtype.BYTE: "BYTE",
InfiniDtype.BOOL: "BOOL",
InfiniDtype.I8: "I8",
InfiniDtype.I16: "I16",
InfiniDtype.I32: "I32",
InfiniDtype.I64: "I64",
InfiniDtype.U8: "U8",
InfiniDtype.U16: "U16",
InfiniDtype.U32: "U32",
InfiniDtype.U64: "U64",
InfiniDtype.F8: "F8",
InfiniDtype.F16: "F16",
InfiniDtype.F32: "F32",
InfiniDtype.F64: "F64",
InfiniDtype.C8: "C8",
InfiniDtype.C16: "C16",
InfiniDtype.C32: "C32",
InfiniDtype.C64: "C64",
InfiniDtype.BF16: "BF16",
}
......@@ -10,8 +10,20 @@ class InfiniDeviceEnum:
SUGON = 8
InfiniDeviceNames = {
InfiniDeviceEnum.CPU: "CPU",
InfiniDeviceEnum.NVIDIA: "NVIDIA",
InfiniDeviceEnum.CAMBRICON: "Cambricon",
InfiniDeviceEnum.ASCEND: "Ascend",
InfiniDeviceEnum.METAX: "Metax",
InfiniDeviceEnum.MOORE: "Moore",
InfiniDeviceEnum.ILUVATAR: "Iluvatar",
InfiniDeviceEnum.KUNLUN: "Kunlun",
InfiniDeviceEnum.SUGON: "Sugon",
}
# Maps InfiniDeviceEnum to the corresponding torch device string
infiniDeviceEnum_str_map = {
torch_device_map = {
InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu",
......
import os
import platform
import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER
from ctypes import c_int, c_int64, c_uint64, POINTER
from .datatypes import *
from .devices import *
from .op_register import OpRegister
from pathlib import Path
Device = c_int
Optype = c_int
from .structs import *
INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini")
class TensorDescriptor(Structure):
_fields_ = []
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
def __init__(self, desc, torch_tensor):
self.descriptor = desc
self.torch_tensor_ = torch_tensor
self.data = torch_tensor.data_ptr()
def destroyDesc(self, lib_):
lib_.infiniopDestroyTensorDescriptor(self.descriptor)
self.descriptor = None
class Handle(Structure):
_fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class InfiniLib:
def __init__(self, librt, libop):
self.librt = librt
......@@ -98,4 +72,9 @@ def open_lib():
lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int
OpRegister.register_lib(lib)
return lib
LIBINFINIOP = open_lib()
from .structs import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
infiniopOperatorDescriptor_t,
)
from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
registry = []
@classmethod
def operator(cls, op):
cls.registry.append(op)
return op
@classmethod
def register_lib(cls, lib):
for op in cls.registry:
op(lib)
@OpRegister.operator
def add_(lib):
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def attention_(lib):
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_size_t,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def causal_softmax_(lib):
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def clip_(lib):
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def conv_(lib):
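    # No ctypes bindings are registered for conv yet; this entry is a placeholder.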
pass
@OpRegister.operator
def gemm_(lib):
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def mul_(lib):
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def random_sample_(lib):
lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
lib.infiniopCreateRandomSampleDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetRandomSampleWorkspaceSize.restype = c_int32
lib.infiniopGetRandomSampleWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopRandomSample.restype = c_int32
lib.infiniopRandomSample.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_size_t,
c_void_p,
c_float,
c_float,
c_int32,
c_float,
c_void_p,
]
lib.infiniopDestroyRandomSampleDescriptor.restype = c_int32
lib.infiniopDestroyRandomSampleDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def rearrange_(lib):
lib.infiniopCreateRearrangeDescriptor.restype = c_int32
lib.infiniopCreateRearrangeDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopRearrange.restype = c_int32
lib.infiniopRearrange.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRearrangeDescriptor.restype = c_int32
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def rms_norm_(lib):
lib.infiniopCreateRMSNormDescriptor.restype = c_int32
lib.infiniopCreateRMSNormDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_float,
]
lib.infiniopGetRMSNormWorkspaceSize.restype = c_int32
lib.infiniopGetRMSNormWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopRMSNorm.restype = c_int32
lib.infiniopRMSNorm.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRMSNormDescriptor.restype = c_int32
lib.infiniopDestroyRMSNormDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def rope_(lib):
lib.infiniopCreateRoPEDescriptor.restype = c_int32
lib.infiniopCreateRoPEDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetRoPEWorkspaceSize.restype = c_int32
lib.infiniopGetRoPEWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopRoPE.restype = c_int32
lib.infiniopRoPE.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyRoPEDescriptor.restype = c_int32
lib.infiniopDestroyRoPEDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def sub_(lib):
lib.infiniopCreateSubDescriptor.restype = c_int32
lib.infiniopCreateSubDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSubWorkspaceSize.restype = c_int32
lib.infiniopGetSubWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopSub.restype = c_int32
lib.infiniopSub.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySubDescriptor.restype = c_int32
lib.infiniopDestroySubDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def swiglu_(lib):
lib.infiniopCreateSwiGLUDescriptor.restype = c_int32
lib.infiniopCreateSwiGLUDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetSwiGLUWorkspaceSize.restype = c_int32
lib.infiniopGetSwiGLUWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopSwiGLU.restype = c_int32
lib.infiniopSwiGLU.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroySwiGLUDescriptor.restype = c_int32
lib.infiniopDestroySwiGLUDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]