Unverified Commit 2790a7b2 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #308 from InfiniTensor/issue/307

issue/307 unify test tensor creation in pytorch tests
parents 70146b74 f62e952e
...@@ -4,15 +4,10 @@ ...@@ -4,15 +4,10 @@
#include "infiniop/handle.h" #include "infiniop/handle.h"
#include "infiniop/ops/add.h" #include "infiniop/ops/add.h"
#include "infiniop/ops/attention.h" #include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h" #include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h" #include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h" #include "infiniop/ops/gemm.h"
#include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h" #include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h" #include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h" #include "infiniop/ops/rearrange.h"
......
#ifndef __INFINIOP_AVG_POOL_API_H__
#define __INFINIOP_AVG_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to an average-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t;
/* Create an AvgPool descriptor binding output tensor y to input tensor x.
 * kernel_shape, pads and strides each contain n entries, one per pooling
 * (spatial) dimension; n is the number of entries in those arrays. */
__C __export infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle,
                                                            infiniopAvgPoolDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t y,
                                                            infiniopTensorDescriptor_t x,
                                                            size_t const *kernel_shape,
                                                            size_t const *pads,
                                                            ptrdiff_t const *strides,
                                                            size_t n);
/* Query the scratch-buffer size (in bytes) required by infiniopAvgPool. */
__C __export infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size);
/* Run average pooling: read x, write the pooled result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc,
                                            void *workspace, size_t workspace_size,
                                            void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_EXPAND_API_H__
#define __INFINIOP_EXPAND_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to an expand (broadcast-copy) operator instance. */
typedef struct InfiniopDescriptor *infiniopExpandDescriptor_t;
/* Create an Expand descriptor binding output tensor y to input tensor x.
 * Presumably x is broadcast to y's shape — confirm with the implementation. */
__C __export infiniStatus_t infiniopCreateExpandDescriptor(infiniopHandle_t handle,
                                                           infiniopExpandDescriptor_t *desc_ptr,
                                                           infiniopTensorDescriptor_t y,
                                                           infiniopTensorDescriptor_t x);
/* Run the expand: read x, write into y. stream selects the device
 * stream/queue (may be NULL for the default). No workspace is required. */
__C __export infiniStatus_t infiniopExpand(infiniopExpandDescriptor_t desc,
                                           void *y,
                                           void const *x,
                                           void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyExpandDescriptor(infiniopExpandDescriptor_t desc);
#endif
#ifndef __INFINIOP_GLOBAL_AVG_POOL_API_H__
#define __INFINIOP_GLOBAL_AVG_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to a global average-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopGlobalAvgPoolDescriptor_t;
/* Create a GlobalAvgPool descriptor binding output tensor y to input x. */
__C __export infiniStatus_t infiniopCreateGlobalAvgPoolDescriptor(infiniopHandle_t handle,
                                                                  infiniopGlobalAvgPoolDescriptor_t *desc_ptr,
                                                                  infiniopTensorDescriptor_t y,
                                                                  infiniopTensorDescriptor_t x);
/* Query the scratch-buffer size (in bytes) required by infiniopGlobalAvgPool. */
__C __export infiniStatus_t infiniopGetGlobalAvgPoolWorkspaceSize(infiniopGlobalAvgPoolDescriptor_t desc, size_t *size);
/* Run global average pooling: read x, write the result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopGlobalAvgPool(infiniopGlobalAvgPoolDescriptor_t desc,
                                                  void *workspace, size_t workspace_size,
                                                  void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyGlobalAvgPoolDescriptor(infiniopGlobalAvgPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MAX_POOL_API_H__
#define __INFINIOP_MAX_POOL_API_H__
#include "../operator_descriptor.h"
/* Opaque handle to a max-pooling operator instance. */
typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t;
/* Create a MaxPool descriptor binding output tensor y to input tensor x.
 * kernel_shape, pads and strides each contain n entries, one per pooling
 * (spatial) dimension; n is the number of entries in those arrays. */
__C __export infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle,
                                                            infiniopMaxPoolDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t y,
                                                            infiniopTensorDescriptor_t x,
                                                            size_t const *kernel_shape,
                                                            size_t const *pads,
                                                            ptrdiff_t const *strides,
                                                            size_t n);
/* Query the scratch-buffer size (in bytes) required by infiniopMaxPool. */
__C __export infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size);
/* Run max pooling: read x, write the pooled result into y.
 * workspace must be at least the size reported above; stream selects the
 * device stream/queue (may be NULL for the default). */
__C __export infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc,
                                            void *workspace, size_t workspace_size,
                                            void *y, void const *x, void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc);
#endif
#ifndef __INFINIOP_MLP_API_H__
#define __INFINIOP_MLP_API_H__
#include "../operator_descriptor.h"
#include "gemm.h"
#include "swiglu.h"
/* Opaque handle to a fused MLP operator instance (composed from GEMM and
 * SwiGLU, per the includes above). */
typedef struct InfiniopDescriptor *infiniopMLPDescriptor_t;
/* Create an MLP descriptor.
 * y_desc/x_desc: output and input activations; w12_desc/w3_desc: weights.
 * alpha: scaling factor applied inside the computation — presumably the
 * GEMM scale; confirm with the implementation.
 * residual: flag byte; presumably non-zero adds the input back into the
 * output (residual connection) — confirm with the implementation. */
__C __export infiniStatus_t infiniopCreateMLPDescriptor(infiniopHandle_t handle,
                                                        infiniopMLPDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t y_desc,
                                                        infiniopTensorDescriptor_t x_desc,
                                                        infiniopTensorDescriptor_t w12_desc,
                                                        infiniopTensorDescriptor_t w3_desc,
                                                        float alpha,
                                                        char residual);
/* Query the scratch-buffer size (in bytes) required by infiniopMLP. */
__C __export infiniStatus_t infiniopGetMLPWorkspaceSize(infiniopMLPDescriptor_t desc, size_t *size);
/* Run the MLP: y = f(x, w12, w3). workspace must be at least the size
 * reported above; stream selects the device stream/queue (may be NULL). */
__C __export infiniStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *y,
                                        const void *x,
                                        const void *w12,
                                        const void *w3,
                                        void *stream);
/* Release all resources owned by the descriptor. */
__C __export infiniStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
#endif
...@@ -13,17 +13,17 @@ def run_tests(args): ...@@ -13,17 +13,17 @@ def run_tests(args):
failed = [] failed = []
for test in [ for test in [
"add.py", "add.py",
"attention.py",
"causal_softmax.py",
"clip.py", "clip.py",
"gemm.py", "gemm.py",
"mul.py",
"random_sample.py", "random_sample.py",
"rearrange.py",
"rms_norm.py", "rms_norm.py",
"rope.py", "rope.py",
"sub.py", "sub.py",
"swiglu.py", "swiglu.py",
"attention.py",
"causal_softmax.py",
"rearrange.py",
"mul.py",
]: ]:
result = subprocess.run( result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64 from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
create_workspace, TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -58,12 +59,12 @@ _TEST_CASES = [ ...@@ -58,12 +59,12 @@ _TEST_CASES = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-7}, InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
} }
DEBUG = False DEBUG = False
...@@ -72,52 +73,13 @@ NUM_PRERUN = 10 ...@@ -72,52 +73,13 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class AddDescriptor(Structure): def add(c, a, b):
_fields_ = [("device", c_int32)] torch.add(a, b, out=c)
infiniopAddDescriptor_t = POINTER(AddDescriptor)
def add(ans, x, y):
torch.add(x, y, out=ans)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
rearrange the tensors if needed and apply the inplace config.
if inplace is true and the output (i.e., c) is placed to the broadcasted input,
the inplace config is ignored and out-of-place is used
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
a_stride=None, a_stride=None,
b_stride=None, b_stride=None,
...@@ -126,58 +88,64 @@ def test( ...@@ -126,58 +88,64 @@ def test(
dtype=torch.float16, dtype=torch.float16,
sync=None, sync=None,
): ):
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
if inplace == Inplace.INPLACE_A:
if a_stride != c_stride:
return
c = a
elif inplace == Inplace.INPLACE_B:
if c_stride != b_stride:
return
c = b
else:
c = TestTensor(shape, c_stride, dtype, device, mode="ones")
if c.is_broadcast():
return
print( print(
f"Testing Add on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " f"Testing Add on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}" f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
) )
a = torch.rand(shape, dtype=dtype).to(torch_device) add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
ans = torch.zeros(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
add(ans, a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopAddDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateAddDescriptor( LIBINFINIOP.infiniopCreateAddDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
c_tensor.descriptor, c.descriptor,
a_tensor.descriptor, a.descriptor,
b_tensor.descriptor, b.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]: for tensor in [a, b, c]:
tensor.destroyDesc(lib) tensor.destroy_desc()
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetAddWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetAddWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, c.device) workspace = TestWorkspace(workspace_size.value, c.device)
def lib_add(): def lib_add():
check_error( check_error(
lib.infiniopAdd( LIBINFINIOP.infiniopAdd(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace.size(),
c_tensor.data, c.data(),
a_tensor.data, a.data(),
b_tensor.data, b.data(),
None, None,
) )
) )
...@@ -186,52 +154,20 @@ def test( ...@@ -186,52 +154,20 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(c, ans, atol=atol, rtol=rtol) debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: add(ans, a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: add(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_add(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_add(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyAddDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyAddDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateAddDescriptor.restype = c_int32
lib.infiniopCreateAddDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAddDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAddWorkspaceSize.restype = c_int32
lib.infiniopGetAddWorkspaceSize.argtypes = [
infiniopAddDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAdd.restype = c_int32
lib.infiniopAdd.argtypes = [
infiniopAddDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAddDescriptor.restype = c_int32
lib.infiniopDestroyAddDescriptor.argtypes = [
infiniopAddDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -240,6 +176,6 @@ if __name__ == "__main__": ...@@ -240,6 +176,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p from ctypes import c_uint64
import ctypes import ctypes
import sys import sys
import os import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from libinfiniop import ( from libinfiniop import (
open_lib, LIBINFINIOP,
to_tensor, TestTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
check_error,
rearrange_tensor,
create_workspace,
get_args,
get_test_devices, get_test_devices,
check_error,
test_operator, test_operator,
get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
import torch import torch
class AttentionDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopAttentionDescriptor_t = POINTER(AttentionDescriptor)
def causal_softmax(x): def causal_softmax(x):
type = x.dtype type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
...@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos): ...@@ -85,9 +79,8 @@ def attention(q, k, v, k_cache, v_cache, pos):
def test( def test(
lib,
handle, handle,
torch_device, device,
n_q_head, n_q_head,
n_kv_head, n_kv_head,
seq_len, seq_len,
...@@ -100,94 +93,79 @@ def test( ...@@ -100,94 +93,79 @@ def test(
v_stride=None, v_stride=None,
k_cache_stride=None, k_cache_stride=None,
v_cache_stride=None, v_cache_stride=None,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing Attention on {torch_device} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} " f"Testing Attention on {InfiniDeviceNames[device]} with n_q_head:{n_q_head} n_kv_head:{n_kv_head} seq_len:{seq_len} head_dim:{head_dim} pos:{pos} "
f"dtype:{dtype} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}" f"dtype:{InfiniDtypeNames[dtype]} q_stride:{q_stride} k_stride:{k_stride} v_stride:{v_stride} k_cache_stride:{k_cache_stride} v_cache_stride:{v_cache_stride}"
) )
out = torch.zeros([seq_len, n_q_head, head_dim], dtype=dtype, device=torch_device) out = TestTensor([seq_len, n_q_head, head_dim], None, dtype, device, mode="zeros")
q = torch.rand([n_q_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 q = TestTensor([n_q_head, seq_len, head_dim], q_stride, dtype, device, scale=0.1)
k = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 k = TestTensor([n_kv_head, seq_len, head_dim], k_stride, dtype, device, scale=0.1)
v = torch.rand([n_kv_head, seq_len, head_dim], dtype=dtype).to(torch_device) * 0.1 v = TestTensor([n_kv_head, seq_len, head_dim], v_stride, dtype, device, scale=0.1)
k_cache = ( k_cache = TestTensor(
torch.rand([n_kv_head, k_cache_buf_len, head_dim], dtype=dtype).to(torch_device) [n_kv_head, k_cache_buf_len, head_dim], k_cache_stride, dtype, device, scale=0.1
* 0.1
) )
v_cache = ( v_cache = TestTensor(
torch.rand([n_kv_head, v_cache_buf_len, head_dim], dtype=dtype).to(torch_device) [n_kv_head, v_cache_buf_len, head_dim], v_cache_stride, dtype, device, scale=0.1
* 0.1
) )
ans = attention(q, k, v, k_cache, v_cache, pos) def torch_attention():
return attention(
if q_stride is not None: q.torch_tensor(),
q = rearrange_tensor(q, q_stride) k.torch_tensor(),
if k_stride is not None: v.torch_tensor(),
k = rearrange_tensor(k, k_stride) k_cache.torch_tensor(),
if v_stride is not None: v_cache.torch_tensor(),
v = rearrange_tensor(v, v_stride) pos,
if k_cache_stride is not None: )
k_cache = rearrange_tensor(k_cache, k_cache_stride)
if v_cache_stride is not None: ans = torch_attention()
v_cache = rearrange_tensor(v_cache, v_cache_stride)
out_tensor = to_tensor(out, lib)
q_tensor = to_tensor(q, lib)
k_tensor = to_tensor(k, lib)
v_tensor = to_tensor(v, lib)
k_cache_tensor = to_tensor(k_cache, lib)
v_cache_tensor = to_tensor(v_cache, lib)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopAttentionDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateAttentionDescriptor( LIBINFINIOP.infiniopCreateAttentionDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
out_tensor.descriptor, out.descriptor,
q_tensor.descriptor, q.descriptor,
k_tensor.descriptor, k.descriptor,
v_tensor.descriptor, v.descriptor,
k_cache_tensor.descriptor, k_cache.descriptor,
v_cache_tensor.descriptor, v_cache.descriptor,
pos, pos,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [ for tensor in [out, q, k, v, k_cache, v_cache]:
out_tensor, tensor.destroy_desc()
q_tensor,
k_tensor,
v_tensor,
k_cache_tensor,
v_cache_tensor,
]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetAttentionWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetAttentionWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, out.device) workspace = TestWorkspace(workspace_size.value, out.device)
def lib_attention(): def lib_attention():
check_error( check_error(
lib.infiniopAttention( LIBINFINIOP.infiniopAttention(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
out_tensor.data, out.data(),
q_tensor.data, q.data(),
k_tensor.data, k.data(),
v_tensor.data, v.data(),
k_cache_tensor.data, k_cache.data(),
v_cache_tensor.data, v_cache.data(),
None, None,
) )
) )
...@@ -197,25 +175,25 @@ def test( ...@@ -197,25 +175,25 @@ def test(
# Validate results # Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(out, ans, atol=atol, rtol=rtol) debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out, ans, atol=atol, rtol=rtol) assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: attention(q, k, v, k_cache, v_cache, pos), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: torch_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_attention(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_attention(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyAttentionDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyAttentionDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-3}, InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-3},
} }
DEBUG = False DEBUG = False
...@@ -284,45 +262,6 @@ if __name__ == "__main__": ...@@ -284,45 +262,6 @@ if __name__ == "__main__":
), ),
] ]
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateAttentionDescriptor.restype = c_int32
lib.infiniopCreateAttentionDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAttentionDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_uint64,
]
lib.infiniopGetAttentionWorkspaceSize.restype = c_int32
lib.infiniopGetAttentionWorkspaceSize.argtypes = [
infiniopAttentionDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAttention.restype = c_int32
lib.infiniopAttention.argtypes = [
infiniopAttentionDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAttentionDescriptor.restype = c_int32
lib.infiniopDestroyAttentionDescriptor.argtypes = [
infiniopAttentionDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -332,5 +271,5 @@ if __name__ == "__main__": ...@@ -332,5 +271,5 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, test_cases, _TENSOR_DTYPES) test_operator(device, test, test_cases, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch
from typing import Tuple
# Controls whether the PyTorch and library implementations are profiled.
# NOTE: profiling requires manually adding a synchronization call around the
# library function, e.g. cudaDeviceSynchronize() for CUDA.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing
NUM_ITERATIONS = 1000 # timed iterations when PROFILE is enabled
class AvgPoolDescriptor(Structure):
    # ctypes view of the library's opaque operator descriptor; only the
    # leading device field is declared here.
    _fields_ = [("device", c_int32)]


# Pointer type matching the C API's infiniopAvgPoolDescriptor_t.
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation=1):
    """Reference average pooling via torch.nn.AvgPool{1,2,3}d.

    x has shape (N, C, *spatial); k/padding/stride describe the window.
    Returns the pooled tensor, or None for an unsupported rank.
    NOTE(review): `dilation` is accepted but never used — torch AvgPool
    layers take no dilation argument.
    """
    layer_by_rank = {
        1: torch.nn.AvgPool1d,
        2: torch.nn.AvgPool2d,
        3: torch.nn.AvgPool3d,
    }
    spatial_rank = len(x.shape) - 2
    layer_cls = layer_by_rank.get(spatial_rank)
    if layer_cls is None:
        print("Error: Pytorch -> Unsupported tensor dimension")
        return None
    pooling = layer_cls(k, stride=stride, padding=padding)
    if spatial_rank == 3 and x.dtype == torch.float16:
        # Round-trip through fp32 for the 3-D fp16 case — presumably no
        # native fp16 kernel; confirm against the torch backend in use.
        ans = pooling(x.to(torch.float32)).to(torch.float16)
    else:
        ans = pooling(x)
    if PROFILE:
        torch.cuda.synchronize()
    return ans
def inferShape(x_shape, kernel_shape, padding, strides):
    """Return the pooled output shape for an (N, C, *spatial) input.

    Each spatial dim maps to (dim + 2*pad - kernel) // stride + 1; the
    leading two (batch, channel) dims are passed through unchanged.
    """
    assert (
        len(x_shape) - 2 == len(kernel_shape) == len(padding) == len(strides)
    ), "kernel, pads, and strides should have the same length; the length of input x should be 2 more than that of kernel"
    pooled = tuple(
        (dim + 2 * p - k) // s + 1
        for dim, k, p, s in zip(x_shape[2:], kernel_shape, padding, strides)
    )
    return x_shape[:2] + pooled
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
    """Copy *py_tuple* into a ctypes int64 array and return it cast to void*.

    ctypes.cast keeps a reference to the backing array, so the returned
    pointer stays valid while it is alive.
    """
    buf = (ctypes.c_int64 * len(py_tuple))(*py_tuple)
    return ctypes.cast(buf, ctypes.c_void_p)
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    k_shape,
    padding,
    strides,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one AvgPool case: compare the library result against torch.

    lib/handle: the loaded infiniop library and a device handle.
    torch_device: torch device string ("cpu"/"cuda"/"mlu").
    x_shape/k_shape/padding/strides: pooling configuration.
    sync: optional device-synchronization callable.
    Raises AssertionError when the library output diverges from torch.
    """
    print(
        f"Testing AvgPool on {torch_device} with x_shape:{x_shape} kernel_shape:{k_shape} padding:{padding} strides:{strides} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y starts as random garbage; the library call must overwrite all of it.
    y = torch.rand(
        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
    ).to(torch_device)
    # Reference result (extra warm-up runs only when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = pool(x, k_shape, padding, strides)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = pool(x, k_shape, padding, strides)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    # Wrap the torch tensors in library tensor descriptors.
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
            tuple_to_void_p(k_shape),
            tuple_to_void_p(padding),
            tuple_to_void_p(strides),
            len(k_shape),
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
    )
    # Allocate the workspace as a byte tensor on the same device.
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every test case on the CPU backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every test case on the CUDA backend in fp16 and fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every test case on the BANG (MLU) backend in fp16 and fp32."""
    import torch_mlu  # registers the "mlu" torch device

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for x_shape, kernel_shape, padding, strides in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
# fmt: on
]
args = get_args()
lib = open_lib()
lib.infiniopCreateAvgPoolDescriptor.restype = c_int32
lib.infiniopCreateAvgPoolDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopAvgPoolDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
c_void_p,
c_void_p,
c_void_p,
c_uint64,
]
lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32
lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [
infiniopAvgPoolDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopAvgPool.restype = c_int32
lib.infiniopAvgPool.argtypes = [
infiniopAvgPoolDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32
lib.infiniopDestroyAvgPoolDescriptor.argtypes = [
infiniopAvgPoolDescriptor_t,
]
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if not (args.cpu or args.cuda or args.bang):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -34,13 +35,13 @@ _TEST_CASES_ = [ ...@@ -34,13 +35,13 @@ _TEST_CASES_ = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.bfloat16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
torch.bfloat16: {"atol": 5e-3, "rtol": 5e-2}, InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
torch.float32: {"atol": 1e-5, "rtol": 1e-5}, InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
} }
...@@ -66,13 +67,6 @@ NUM_PRERUN = 10 ...@@ -66,13 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class CausalSoftmaxDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopCausalSoftmaxDescriptor_t = POINTER(CausalSoftmaxDescriptor)
def causal_softmax(x): def causal_softmax(x):
type = x.dtype type = x.dtype
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])
...@@ -81,66 +75,57 @@ def causal_softmax(x): ...@@ -81,66 +75,57 @@ def causal_softmax(x):
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
x_stride=None, x_stride=None,
y_stride=None, y_stride=None,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing CausalSoftmax on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{dtype} inplace:{inplace}" f"Testing CausalSoftmax on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
) )
x = torch.rand(shape, dtype=dtype).to(torch_device) x = TestTensor(shape, x_stride, dtype, device)
mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1]) ans = causal_softmax(x.torch_tensor())
x = torch.where(mask == 1, torch.full_like(x, torch.finfo(x.dtype).max), x)
ans = causal_softmax(x)
x = rearrange_if_needed(x, x_stride)
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X: if inplace == Inplace.INPLACE_X:
y = x y = x
y_tensor = x_tensor
else: else:
y = torch.zeros(shape, dtype=dtype).to(torch_device) y = TestTensor(shape, x_stride, dtype, device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopCausalSoftmaxDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateCausalSoftmaxDescriptor( LIBINFINIOP.infiniopCreateCausalSoftmaxDescriptor(
handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_tensor.destroyDesc(lib) x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetCausalSoftmaxWorkspaceSize( LIBINFINIOP.infiniopGetCausalSoftmaxWorkspaceSize(
descriptor, ctypes.byref(workspace_size) descriptor, ctypes.byref(workspace_size)
) )
) )
workspace = create_workspace(workspace_size.value, x.device) workspace = TestWorkspace(workspace_size.value, x.device)
def lib_causal_softmax(): def lib_causal_softmax():
check_error( check_error(
lib.infiniopCausalSoftmax( LIBINFINIOP.infiniopCausalSoftmax(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
y_tensor.data, y.data(),
x_tensor.data, x.data(),
None, None,
) )
) )
...@@ -152,49 +137,21 @@ def test( ...@@ -152,49 +137,21 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(y, ans, atol=atol, rtol=rtol) debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol) assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: causal_softmax(x), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: causal_softmax(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_causal_softmax(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_causal_softmax(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyCausalSoftmaxDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyCausalSoftmaxDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopCreateCausalSoftmaxDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopCausalSoftmaxDescriptor_t),
infiniopTensorDescriptor_t,
]
lib.infiniopGetCausalSoftmaxWorkspaceSize.restype = c_int32
lib.infiniopGetCausalSoftmaxWorkspaceSize.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopCausalSoftmax.restype = c_int32
lib.infiniopCausalSoftmax.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
]
lib.infiniopDestroyCausalSoftmaxDescriptor.restype = c_int32
lib.infiniopDestroyCausalSoftmaxDescriptor.argtypes = [
infiniopCausalSoftmaxDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -203,6 +160,6 @@ if __name__ == "__main__": ...@@ -203,6 +160,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
...@@ -2,21 +2,22 @@ ...@@ -2,21 +2,22 @@
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
from enum import Enum, auto from enum import Enum, auto
...@@ -51,12 +52,12 @@ _TEST_CASES_ = [ ...@@ -51,12 +52,12 @@ _TEST_CASES_ = [
] ]
_TENSOR_DTYPES = [torch.float16, torch.float32] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 1e-3, "rtol": 1e-3}, InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
torch.float32: {"atol": 1e-7, "rtol": 1e-6}, InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-6},
} }
...@@ -82,156 +83,108 @@ NUM_PRERUN = 10 ...@@ -82,156 +83,108 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
class ClipDescriptor(Structure): def clip(y, x, min_val, max_val):
_fields_ = [("device_type", c_int32), ("device_id", c_int32)] torch.clamp(x, min_val, max_val, out=y)
infiniopClipDescriptor_t = POINTER(ClipDescriptor)
def clip(x, min_val, max_val):
return torch.clamp(x, min_val, max_val)
def test( def test(
lib,
handle, handle,
torch_device, device,
shape, shape,
x_stride=None, x_stride=None,
y_stride=None, y_stride=None,
min_val=-1.0, min_val=-1.0,
max_val=1.0, max_val=1.0,
inplace=Inplace.OUT_OF_PLACE, inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float32, dtype=InfiniDtype.F32,
sync=None, sync=None,
): ):
print( x = TestTensor(shape, x_stride, dtype, device)
f"Testing Clip on {torch_device} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " min_ = TestTensor(
f"min_val:{min_val} max_val:{max_val} dtype:{dtype} inplace:{inplace}" shape, [0 for _ in shape], dtype, device, mode="zeros", bias=min_val
) )
x = torch.rand(shape, dtype=dtype).to(torch_device) max_ = TestTensor(
ans = clip(x, min_val, max_val) shape, [0 for _ in shape], dtype, device, mode="zeros", bias=max_val
x = rearrange_if_needed(x, x_stride) )
x_tensor = to_tensor(x, lib)
if inplace == Inplace.INPLACE_X: if inplace == Inplace.INPLACE_X:
if x_stride != y_stride:
return
y = x y = x
y_tensor = x_tensor
else: else:
y = torch.zeros(shape, dtype=dtype).to(torch_device) y = TestTensor(shape, y_stride, dtype, device)
y = rearrange_if_needed(y, y_stride)
y_tensor = to_tensor(y, lib) if y.is_broadcast():
return
print(
f"Testing Clip on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
f"min_val:{min_val} max_val:{max_val} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val)
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopClipDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
min_, max_ = torch.tensor([min_val], dtype=dtype).to(torch_device), torch.tensor(
[max_val], dtype=dtype
).to(torch_device)
min_tensor = to_tensor(
min_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
max_tensor = to_tensor(
max_, lib, force_shape=shape, force_strides=[0 for _ in shape]
)
check_error( check_error(
lib.infiniopCreateClipDescriptor( LIBINFINIOP.infiniopCreateClipDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
y_tensor.descriptor, y.descriptor,
x_tensor.descriptor, x.descriptor,
min_tensor.descriptor, min_.descriptor,
max_tensor.descriptor, max_.descriptor,
) )
) )
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetClipWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetClipWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, x.device) workspace = TestWorkspace(workspace_size.value, x.device)
def lib_clip(): def lib_clip():
check_error( check_error(
lib.infiniopClip( LIBINFINIOP.infiniopClip(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data() if workspace is not None else None,
workspace_size.value, workspace_size.value,
y_tensor.data, y.data(),
x_tensor.data, x.data(),
min_tensor.data, min_.data(),
max_tensor.data, max_.data(),
None, None,
) )
) )
lib_clip() lib_clip()
# Now we can destroy the tensor descriptors # Destroy the tensor descriptors
x_tensor.destroyDesc(lib) for tensor in [x, y, min_, max_]:
if inplace != Inplace.INPLACE_X: tensor.destroy_desc()
y_tensor.destroyDesc(lib)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG or not torch.allclose(y, ans, atol=atol, rtol=rtol): if DEBUG:
print("\nExpected:") debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
print(ans) assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
print("\nActual:")
print(y)
print("\nDifference:")
print(torch.abs(y - ans))
print("\nMax difference:", torch.max(torch.abs(y - ans)).item())
debug(y, ans, atol=atol, rtol=rtol)
assert torch.allclose(y, ans, atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: clip(x, min_val, max_val), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: clip(y.torch_tensor(), x.torch_tensor(), min_val, max_val), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_clip(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_clip(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyClipDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyClipDescriptor(descriptor))
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateClipDescriptor.restype = c_int32
lib.infiniopCreateClipDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopClipDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetClipWorkspaceSize.restype = c_int32
lib.infiniopGetClipWorkspaceSize.argtypes = [
infiniopClipDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopClip.restype = c_int32
lib.infiniopClip.argtypes = [
infiniopClipDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyClipDescriptor.restype = c_int32
lib.infiniopDestroyClipDescriptor.argtypes = [
infiniopClipDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
PROFILE = args.profile PROFILE = args.profile
...@@ -239,6 +192,6 @@ if __name__ == "__main__": ...@@ -239,6 +192,6 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
rearrange_tensor,
)
from operatorspy.tests.test_utils import get_args
import torch
# Controls whether the PyTorch reference and the lib implementation are timed.
# NOTE: when profiling on an async backend you must add an explicit device
# synchronization inside the lib call path (e.g. cudaDeviceSynchronize() for
# CUDA), otherwise the measured time only covers kernel launch.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing starts
NUM_ITERATIONS = 1000  # timed iterations; reported time is the per-call average
# ctypes mirror of the C-side expand descriptor. Only the first field is
# declared (presumably the device enum — confirm against the C struct); the
# tests treat the descriptor as opaque and never read past it.
class ExpandDescriptor(Structure):
    _fields_ = [("device", c_int32)]

# Handle type passed to the infiniopExpand* C API functions.
infiniopExpandDescriptor_t = POINTER(ExpandDescriptor)
def expand(x, y):
    """Reference implementation: broadcast ``x`` to the shape of ``y``.

    In normal runs this returns a zero-copy expanded view. When PROFILE is
    on, the view is materialized with clone() and the CUDA device is
    synchronized so wall-clock timing reflects the actual work.
    """
    if not PROFILE:
        return x.expand_as(y)
    materialized = x.expand_as(y).clone()
    torch.cuda.synchronize()
    return materialized
def test(
    lib,
    handle,
    torch_device,
    y_shape,
    x_shape,
    y_stride=None,
    x_stride=None,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one Expand case: compute the PyTorch reference, run the lib
    operator on the same buffers, and assert the results match.

    Args:
        lib: ctypes handle to the loaded infiniop shared library.
        handle: infiniop device handle created by create_handle().
        torch_device: torch device string ("cpu", "cuda", "mlu", ...).
        y_shape / x_shape: output and input tensor shapes.
        y_stride / x_stride: optional custom strides; applied via
            rearrange_tensor when given.
        tensor_dtype: torch dtype for both tensors.
        sync: optional device-synchronization callable, run before the
            descriptor is created.
    """
    print(
        f"Testing Expand on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} x_stride:{x_stride} y_stride:{y_stride} dtype:{tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    # y is pre-filled with random data; the lib op must overwrite all of it.
    y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device)
    if x_stride is not None:
        x = rearrange_tensor(x, x_stride)
    if y_stride is not None:
        y = rearrange_tensor(y, y_stride)
    # Reference result (and PyTorch-side warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = expand(x, y)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = expand(x, y)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopExpandDescriptor_t()
    check_error(
        lib.infiniopCreateExpandDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # Run the lib operator, writing into y's buffer (warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None))
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopExpand(descriptor, y_tensor.data, x_tensor.data, None)
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    # Expand copies values; exact match is expected up to a small rtol.
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyExpandDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every expand test case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every expand test case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every expand test case on the Cambricon BANG backend (fp16, fp32)."""
    import torch_mlu  # registers the "mlu" device with torch

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for y_shape, x_shape, y_stride, x_stride in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Each case is (y_shape, x_shape, y_stride, x_stride); x must be
    # broadcastable to y. None strides mean contiguous layout.
    test_cases = [
        # fmt: off
        # y_shape, x_shape, y_stride, x_stride
        ((), (), None, None),
        ((3, 3), (1,), None, None),
        ((5, 4, 3), (4, 3,), None, (6, 1)),
        ((99, 111), (111,), None, None),
        ((2, 4, 3), (1, 3), None, None),
        ((2, 20, 3), (2, 1, 3), None, None),
        ((2, 3, 4, 5), (5,), None, None),
        ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
        ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
        # fmt: on
    ]
    args = get_args()
    lib = open_lib()
    # Register restype/argtypes for every C entry point so ctypes marshals
    # arguments correctly; these must mirror the C API declarations.
    lib.infiniopCreateExpandDescriptor.restype = c_int32
    lib.infiniopCreateExpandDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopExpandDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopExpand.restype = c_int32
    lib.infiniopExpand.argtypes = [
        infiniopExpandDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyExpandDescriptor.restype = c_int32
    lib.infiniopDestroyExpandDescriptor.argtypes = [
        infiniopExpandDescriptor_t,
    ]
    # Dispatch on the requested backend flags; default to CPU if none given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
import torch import torch
import ctypes import ctypes
from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float from ctypes import c_uint64
from libinfiniop import ( from libinfiniop import (
infiniopHandle_t, LIBINFINIOP,
infiniopTensorDescriptor_t, TestTensor,
open_lib,
to_tensor,
get_test_devices, get_test_devices,
check_error, check_error,
rearrange_if_needed,
create_workspace,
test_operator, test_operator,
get_args, get_args,
debug, debug,
get_tolerance, get_tolerance,
profile_operation, profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
) )
# ============================================================================== # ==============================================================================
...@@ -31,13 +32,13 @@ _TEST_CASES = [ ...@@ -31,13 +32,13 @@ _TEST_CASES = [
] ]
# Data types used for testing # Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32, torch.bfloat16] _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types # Tolerance map for different data types
_TOLERANCE_MAP = { _TOLERANCE_MAP = {
torch.float16: {"atol": 0, "rtol": 1e-2}, InfiniDtype.F16: {"atol": 0, "rtol": 1e-2},
torch.float32: {"atol": 0, "rtol": 1e-3}, InfiniDtype.F32: {"atol": 0, "rtol": 1e-3},
torch.bfloat16: {"atol": 0, "rtol": 5e-2}, InfiniDtype.BF16: {"atol": 0, "rtol": 5e-2},
} }
DEBUG = False DEBUG = False
...@@ -46,16 +47,6 @@ NUM_PRERUN = 10 ...@@ -46,16 +47,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000 NUM_ITERATIONS = 1000
# ==============================================================================
# Definitions
# ==============================================================================
class GemmDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopGemmDescriptor_t = POINTER(GemmDescriptor)
# PyTorch implementation for matrix multiplication # PyTorch implementation for matrix multiplication
def gemm(d, _c, beta, _a, _b, alpha): def gemm(d, _c, beta, _a, _b, alpha):
try: try:
...@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha): ...@@ -73,9 +64,8 @@ def gemm(d, _c, beta, _a, _b, alpha):
# The argument list should be (lib, handle, torch_device, <param list>, dtype) # The argument list should be (lib, handle, torch_device, <param list>, dtype)
# The <param list> should keep the same order as the one specified in _TEST_CASES # The <param list> should keep the same order as the one specified in _TEST_CASES
def test( def test(
lib,
handle, handle,
torch_device, device,
alpha, alpha,
beta, beta,
a_shape, a_shape,
...@@ -84,65 +74,71 @@ def test( ...@@ -84,65 +74,71 @@ def test(
a_stride=None, a_stride=None,
b_stride=None, b_stride=None,
c_stride=None, c_stride=None,
dtype=torch.float16, dtype=InfiniDtype.F16,
sync=None, sync=None,
): ):
print( print(
f"Testing Gemm on {torch_device} with alpha:{alpha}, beta:{beta}," f"Testing Gemm on {InfiniDeviceNames[device]} with alpha:{alpha}, beta:{beta},"
f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape}," f" a_shape:{a_shape}, b_shape:{b_shape}, c_shape:{c_shape},"
f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{dtype}" f" a_stride:{a_stride}, b_stride:{b_stride}, c_stride:{c_stride}, dtype:{InfiniDtypeNames[dtype]}"
) )
# Initialize tensors # Initialize tensors
a = torch.rand(a_shape, dtype=dtype).to(torch_device) a = TestTensor(a_shape, a_stride, dtype, device)
b = torch.rand(b_shape, dtype=dtype).to(torch_device) b = TestTensor(b_shape, b_stride, dtype, device)
c = torch.ones(c_shape, dtype=dtype).to(torch_device) c = TestTensor(c_shape, c_stride, dtype, device, mode="ones")
ans = torch.zeros(c_shape, dtype=dtype).to(torch_device) ans = TestTensor(c_shape, c_stride, dtype, device, mode="zeros")
# Compute the PyTorch reference result # Compute the PyTorch reference result
gemm(ans, c, beta, a, b, alpha) def torch_gemm():
gemm(
ans.torch_tensor(),
c.torch_tensor(),
beta,
a.torch_tensor(),
b.torch_tensor(),
alpha,
)
a, b, c = [ torch_gemm()
rearrange_if_needed(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
]
a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
if sync is not None: if sync is not None:
sync() sync()
descriptor = infiniopGemmDescriptor_t() descriptor = infiniopOperatorDescriptor_t()
check_error( check_error(
lib.infiniopCreateGemmDescriptor( LIBINFINIOP.infiniopCreateGemmDescriptor(
handle, handle,
ctypes.byref(descriptor), ctypes.byref(descriptor),
c_tensor.descriptor, c.descriptor,
a_tensor.descriptor, a.descriptor,
b_tensor.descriptor, b.descriptor,
) )
) )
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]: for tensor in [a, b, c]:
tensor.destroyDesc(lib) tensor.destroy_desc()
# Get workspace size and create workspace # Get workspace size and create workspace
workspace_size = c_uint64(0) workspace_size = c_uint64(0)
check_error( check_error(
lib.infiniopGetGemmWorkspaceSize(descriptor, ctypes.byref(workspace_size)) LIBINFINIOP.infiniopGetGemmWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
) )
workspace = create_workspace(workspace_size.value, a.device) workspace = TestWorkspace(workspace_size.value, device)
# Execute infiniop gemm operator # Execute infiniop gemm operator
def lib_gemm(): def lib_gemm():
check_error( check_error(
lib.infiniopGemm( LIBINFINIOP.infiniopGemm(
descriptor, descriptor,
workspace.data_ptr() if workspace is not None else None, workspace.data(),
workspace_size.value, workspace_size.value,
c_tensor.data, c.data(),
a_tensor.data, a.data(),
b_tensor.data, b.data(),
alpha, alpha,
beta, beta,
None, None,
...@@ -155,17 +151,17 @@ def test( ...@@ -155,17 +151,17 @@ def test(
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG: if DEBUG:
debug(c, ans, atol=atol, rtol=rtol) debug(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol) assert torch.allclose(c.actual_tensor(), ans.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow # Profiling workflow
if PROFILE: if PROFILE:
# fmt: off # fmt: off
profile_operation("PyTorch", lambda: gemm(ans, c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation("PyTorch", lambda: torch_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_gemm(), torch_device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on # fmt: on
check_error(lib.infiniopDestroyGemmDescriptor(descriptor)) check_error(LIBINFINIOP.infiniopDestroyGemmDescriptor(descriptor))
# ============================================================================== # ==============================================================================
...@@ -173,40 +169,6 @@ def test( ...@@ -173,40 +169,6 @@ def test(
# ============================================================================== # ==============================================================================
if __name__ == "__main__": if __name__ == "__main__":
args = get_args() args = get_args()
lib = open_lib()
lib.infiniopCreateGemmDescriptor.restype = c_int32
lib.infiniopCreateGemmDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopGemmDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetGemmWorkspaceSize.restype = c_int32
lib.infiniopGetGemmWorkspaceSize.argtypes = [
infiniopGemmDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopGemm.restype = c_int32
lib.infiniopGemm.argtypes = [
infiniopGemmDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_float,
c_float,
c_void_p,
]
lib.infiniopDestroyGemmDescriptor.restype = c_int32
lib.infiniopDestroyGemmDescriptor.argtypes = [
infiniopGemmDescriptor_t,
]
# Configure testing options # Configure testing options
DEBUG = args.debug DEBUG = args.debug
...@@ -216,6 +178,6 @@ if __name__ == "__main__": ...@@ -216,6 +178,6 @@ if __name__ == "__main__":
# Execute tests # Execute tests
for device in get_test_devices(args): for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES) test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m") print("\033[92mTest passed!\033[0m")
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
import ctypes
import sys
import os
import time
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
check_error,
)
from operatorspy.tests.test_utils import get_args
import torch, time
# Controls whether the PyTorch reference and the lib implementation are timed.
# NOTE: when profiling on an async backend you must add an explicit device
# synchronization inside the lib call path (e.g. cudaDeviceSynchronize() for
# CUDA), otherwise the measured time only covers kernel launch.
PROFILE = False
NUM_PRERUN = 10       # warm-up iterations before timing starts
NUM_ITERATIONS = 1000  # timed iterations; reported time is the per-call average
# ctypes mirror of the C-side global-average-pool descriptor. Only the first
# field is declared (presumably the device enum — confirm against the C
# struct); the tests treat the descriptor as opaque.
class GlobalAvgPoolDescriptor(Structure):
    _fields_ = [("device", c_int32)]

# Handle type passed to the infiniopGlobalAvgPool* C API functions.
infiniopGlobalAvgPoolDescriptor_t = POINTER(GlobalAvgPoolDescriptor)
def inferShape(x):
    """Output shape for global avg pool: keep the first two dims (N, C) and
    collapse every remaining spatial dim to size 1."""
    num_spatial = x.dim() - 2
    return x.shape[:2] + (1,) * num_spatial
def globalAvgPool(x):
    """Reference global average pooling: mean over all dims >= 2, with the
    result reshaped to (N, C, 1, 1, ...). Synchronizes CUDA when profiling
    so timings reflect completed work."""
    spatial = tuple(range(2, x.dim()))
    pooled = torch.mean(x, dim=spatial, keepdim=True)
    if PROFILE:
        torch.cuda.synchronize()
    return pooled.view(*inferShape(x))
def test(
    lib,
    handle,
    torch_device,
    x_shape,
    tensor_dtype=torch.float16,
    sync=None
):
    """Run one GlobalAvgPool case: compute the PyTorch reference, run the
    lib operator with its required workspace, and assert the results match.

    Args:
        lib: ctypes handle to the loaded infiniop shared library.
        handle: infiniop device handle created by create_handle().
        torch_device: torch device string ("cpu", "cuda", "mlu", ...).
        x_shape: input tensor shape; output shape is derived by inferShape().
        tensor_dtype: torch dtype for input and output.
        sync: optional device-synchronization callable, run before the
            descriptor is created.
    """
    print(
        f"Testing GlobalAvgPool on {torch_device} with input tensor_shape: {x_shape} dtype: {tensor_dtype}"
    )
    x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
    y = torch.zeros(inferShape(x), dtype=tensor_dtype).to(torch_device)
    # Reference result (and PyTorch-side warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        ans = globalAvgPool(x)
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            _ = globalAvgPool(x)
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f"pytorch time: {elapsed :6f}")
    x_tensor = to_tensor(x, lib)
    y_tensor = to_tensor(y, lib)
    if sync is not None:
        sync()
    descriptor = infiniopGlobalAvgPoolDescriptor_t()
    check_error(
        lib.infiniopCreateGlobalAvgPoolDescriptor(
            handle,
            ctypes.byref(descriptor),
            y_tensor.descriptor,
            x_tensor.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    x_tensor.descriptor.contents.invalidate()
    y_tensor.descriptor.contents.invalidate()
    # Query the operator's scratch requirement and back it with a torch
    # uint8 buffer on the same device.
    workspaceSize = ctypes.c_uint64(0)
    check_error(
        lib.infiniopGetGlobalAvgPoolWorkspaceSize(
            descriptor, ctypes.byref(workspaceSize)
        )
    )
    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
        torch_device
    )
    workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
    # Run the lib operator, writing into y's buffer (warm-up when profiling).
    for i in range(NUM_PRERUN if PROFILE else 1):
        check_error(
            lib.infiniopGlobalAvgPool(
                descriptor,
                workspace_ptr,
                workspaceSize,
                y_tensor.data,
                x_tensor.data,
                None,
            )
        )
    if PROFILE:
        start_time = time.time()
        for i in range(NUM_ITERATIONS):
            check_error(
                lib.infiniopGlobalAvgPool(
                    descriptor,
                    workspace_ptr,
                    workspaceSize,
                    y_tensor.data,
                    x_tensor.data,
                    None,
                )
            )
        elapsed = (time.time() - start_time) / NUM_ITERATIONS
        print(f" lib time: {elapsed :6f}")
    assert torch.allclose(y, ans, atol=0, rtol=1e-3)
    check_error(lib.infiniopDestroyGlobalAvgPoolDescriptor(descriptor))
def test_cpu(lib, test_cases):
    """Run every global-avg-pool case on the CPU backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CPU)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cpu", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
    """Run every global-avg-pool case on the CUDA backend, in fp16 then fp32."""
    handle = create_handle(lib, DeviceEnum.DEVICE_CUDA)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "cuda", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
def test_bang(lib, test_cases):
    """Run every global-avg-pool case on the Cambricon BANG backend (fp16, fp32)."""
    import torch_mlu  # registers the "mlu" device with torch

    handle = create_handle(lib, DeviceEnum.DEVICE_BANG)
    for shape in test_cases:
        for dt in (torch.float16, torch.float32):
            test(lib, handle, "mlu", shape, tensor_dtype=dt)
    destroy_handle(lib, handle)
if __name__ == "__main__":
    # Input shapes only; the output shape is derived via inferShape().
    # NOTE(review): ((32, 256, 1, 112, 112)) appears twice in this list —
    # looks like an accidental duplicate; confirm before removing.
    test_cases = [
        # x_shape
        ((1, 3, 3)),
        ((1, 3, 1, 1, 3)),
        ((1, 3, 1, 1, 257)),
        ((1, 2, 1, 1, 514)),
        ((1, 3, 1, 1, 1025)),
        ((32, 256, 1, 112, 112)),
        ((2, 3, 2048000)),
        ((2, 1, 10243)),
        ((2, 20, 100)),
        ((3, 33, 333)),
        ((32, 20, 512)),
        ((3, 3, 11, 11, 11, 3, 2)),
        ((32, 256, 1, 112, 112)),
        ((32, 256, 112, 112)),
    ]
    args = get_args()
    lib = open_lib()
    # Register restype/argtypes for every C entry point so ctypes marshals
    # arguments correctly; these must mirror the C API declarations.
    lib.infiniopCreateGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopCreateGlobalAvgPoolDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopGlobalAvgPoolDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.restype = c_int32
    lib.infiniopGetGlobalAvgPoolWorkspaceSize.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        POINTER(c_uint64),
    ]
    lib.infiniopGlobalAvgPool.restype = c_int32
    lib.infiniopGlobalAvgPool.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
        c_void_p,
        c_uint64,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyGlobalAvgPoolDescriptor.restype = c_int32
    lib.infiniopDestroyGlobalAvgPoolDescriptor.argtypes = [
        infiniopGlobalAvgPoolDescriptor_t,
    ]
    # Dispatch on the requested backend flags; default to CPU if none given.
    if args.cpu:
        test_cpu(lib, test_cases)
    if args.cuda:
        test_cuda(lib, test_cases)
    if args.bang:
        test_bang(lib, test_cases)
    if not (args.cpu or args.cuda or args.bang):
        test_cpu(lib, test_cases)
    print("\033[92mTest passed!\033[0m")
...@@ -4,10 +4,11 @@ import sys ...@@ -4,10 +4,11 @@ import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import ( from .liboperators import (
open_lib, open_lib,
CTensor,
infiniopHandle_t, infiniopHandle_t,
infiniopTensorDescriptor_t, infiniopTensorDescriptor_t,
LIBINFINIOP,
) )
from .devices import * from .devices import *
from .utils import * from .utils import *
from .datatypes import * from .datatypes import *
from .structs import *
...@@ -19,3 +19,27 @@ class InfiniDtype: ...@@ -19,3 +19,27 @@ class InfiniDtype:
C32 = 17 C32 = 17
C64 = 18 C64 = 18
BF16 = 19 BF16 = 19
# Human-readable name for each InfiniDtype code.  The display string is
# identical to the attribute name, so the table is built by comprehension
# (same keys, same values, same insertion order as an explicit literal).
InfiniDtypeNames = {
    getattr(InfiniDtype, name): name
    for name in (
        "INVALID", "BYTE", "BOOL",
        "I8", "I16", "I32", "I64",
        "U8", "U16", "U32", "U64",
        "F8", "F16", "F32", "F64",
        "C8", "C16", "C32", "C64",
        "BF16",
    )
}
...@@ -10,8 +10,20 @@ class InfiniDeviceEnum: ...@@ -10,8 +10,20 @@ class InfiniDeviceEnum:
SUGON = 8 SUGON = 8
# Display name for each InfiniDeviceEnum value.  Vendor names are mixed-case
# (e.g. "Cambricon"), so the labels are listed explicitly rather than derived
# from the attribute names.
InfiniDeviceNames = dict(
    (getattr(InfiniDeviceEnum, attr), label)
    for attr, label in (
        ("CPU", "CPU"),
        ("NVIDIA", "NVIDIA"),
        ("CAMBRICON", "Cambricon"),
        ("ASCEND", "Ascend"),
        ("METAX", "Metax"),
        ("MOORE", "Moore"),
        ("ILUVATAR", "Iluvatar"),
        ("KUNLUN", "Kunlun"),
        ("SUGON", "Sugon"),
    )
)
# Mapping that maps InfiniDeviceEnum to torch device string # Mapping that maps InfiniDeviceEnum to torch device string
infiniDeviceEnum_str_map = { torch_device_map = {
InfiniDeviceEnum.CPU: "cpu", InfiniDeviceEnum.CPU: "cpu",
InfiniDeviceEnum.NVIDIA: "cuda", InfiniDeviceEnum.NVIDIA: "cuda",
InfiniDeviceEnum.CAMBRICON: "mlu", InfiniDeviceEnum.CAMBRICON: "mlu",
......
import os import os
import platform import platform
import ctypes import ctypes
from ctypes import c_int, c_int64, c_uint64, Structure, POINTER from ctypes import c_int, c_int64, c_uint64, POINTER
from .datatypes import * from .datatypes import *
from .devices import * from .devices import *
from .op_register import OpRegister
from pathlib import Path from pathlib import Path
from .structs import *
Device = c_int
Optype = c_int
INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini") INFINI_ROOT = os.getenv("INFINI_ROOT") or str(Path.home() / ".infini")
# Opaque ctypes stand-in for the C-side tensor descriptor; it is only ever
# handled through pointers, so no fields are declared.
class TensorDescriptor(Structure):
    _fields_ = []
# Descriptors are always passed by pointer across the C API.
infiniopTensorDescriptor_t = ctypes.POINTER(TensorDescriptor)
class CTensor:
    """Pairs an infiniop tensor descriptor with the torch tensor backing it."""

    def __init__(self, desc, torch_tensor):
        self.descriptor = desc
        # Keep a reference so the tensor's storage stays alive while C code
        # uses the raw `data` pointer below.
        self.torch_tensor_ = torch_tensor
        self.data = torch_tensor.data_ptr()

    def destroyDesc(self, lib_):
        # Release the C-side descriptor via the library, then drop the handle
        # so it is not used (or destroyed) again.
        lib_.infiniopDestroyTensorDescriptor(self.descriptor)
        self.descriptor = None
# ctypes view of the library handle: device kind and device index fields.
class Handle(Structure):
    _fields_ = [("device", c_int), ("device_id", c_int)]
infiniopHandle_t = POINTER(Handle)
class InfiniLib: class InfiniLib:
def __init__(self, librt, libop): def __init__(self, librt, libop):
self.librt = librt self.librt = librt
...@@ -98,4 +72,9 @@ def open_lib(): ...@@ -98,4 +72,9 @@ def open_lib():
lib.infinirtSetDevice.argtypes = [c_int, c_int] lib.infinirtSetDevice.argtypes = [c_int, c_int]
lib.infinirtSetDevice.restype = c_int lib.infinirtSetDevice.restype = c_int
OpRegister.register_lib(lib)
return lib return lib
LIBINFINIOP = open_lib()
from .structs import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
infiniopOperatorDescriptor_t,
)
from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
    """Registry of operator-binding callbacks applied to the loaded library."""

    # All callables decorated with @OpRegister.operator, in decoration order.
    registry = []

    @classmethod
    def operator(cls, op):
        """Decorator hook: remember *op* in the registry and return it unchanged."""
        cls.registry.append(op)
        return op

    @classmethod
    def register_lib(cls, lib):
        """Run every recorded binding callback against *lib*."""
        for bind in cls.registry:
            bind(lib)
@OpRegister.operator
def add_(lib):
    """Declare ctypes signatures for the Add operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateAddDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetAddWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopAdd, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyAddDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def attention_(lib):
    """Declare ctypes signatures for the Attention operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateAttentionDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            c_size_t,
        ]),
        (lib.infiniopGetAttentionWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopAttention, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyAttentionDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def causal_softmax_(lib):
    """Declare ctypes signatures for the CausalSoftmax operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateCausalSoftmaxDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetCausalSoftmaxWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopCausalSoftmax, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyCausalSoftmaxDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def clip_(lib):
    """Declare ctypes signatures for the Clip operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateClipDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetClipWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopClip, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyClipDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def conv_(lib):
    # Placeholder: Conv ctypes signatures are not bound yet (the C header
    # infiniop/ops/conv.h exists); registration is currently a no-op.
    pass
@OpRegister.operator
def gemm_(lib):
    """Declare ctypes signatures for the Gemm operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateGemmDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetGemmWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopGemm, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_float,   # presumably alpha — confirm in header
            c_float,   # presumably beta — confirm in header
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyGemmDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def mul_(lib):
    """Declare ctypes signatures for the Mul operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateMulDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetMulWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopMul, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyMulDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def random_sample_(lib):
    """Declare ctypes signatures for the RandomSample operator's C entry points."""
    # NOTE(review): the create call declares only ONE tensor descriptor —
    # verify against infiniop/ops/random_sample.h (sampling ops often take
    # both a result and a probabilities descriptor).
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRandomSampleDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetRandomSampleWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRandomSample, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_size_t,
            c_void_p,
            c_float,
            c_float,
            c_int32,
            c_float,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRandomSampleDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rearrange_(lib):
    """Declare ctypes signatures for the Rearrange operator's C entry points."""
    # NOTE(review): unlike the other operators here, no GetWorkspaceSize
    # binding is declared and infiniopRearrange takes no workspace arguments —
    # confirm this matches infiniop/ops/rearrange.h.
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRearrangeDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopRearrange, [
            infiniopOperatorDescriptor_t,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRearrangeDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rms_norm_(lib):
    """Declare ctypes signatures for the RMSNorm operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRMSNormDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            c_float,   # presumably epsilon — confirm in header
        ]),
        (lib.infiniopGetRMSNormWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRMSNorm, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRMSNormDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def rope_(lib):
    """Declare ctypes signatures for the RoPE operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateRoPEDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetRoPEWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopRoPE, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroyRoPEDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def sub_(lib):
    """Declare ctypes signatures for the Sub operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateSubDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetSubWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopSub, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroySubDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
@OpRegister.operator
def swiglu_(lib):
    """Declare ctypes signatures for the SwiGLU operator's C entry points."""
    # Every entry point returns an int32 status code.
    bindings = [
        (lib.infiniopCreateSwiGLUDescriptor, [
            infiniopHandle_t,
            POINTER(infiniopOperatorDescriptor_t),
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
            infiniopTensorDescriptor_t,
        ]),
        (lib.infiniopGetSwiGLUWorkspaceSize, [
            infiniopOperatorDescriptor_t,
            POINTER(c_size_t),
        ]),
        (lib.infiniopSwiGLU, [
            infiniopOperatorDescriptor_t,
            c_void_p,  # workspace
            c_size_t,  # workspace size
            c_void_p,
            c_void_p,
            c_void_p,
            c_void_p,  # last pointer is presumably the stream — confirm in header
        ]),
        (lib.infiniopDestroySwiGLUDescriptor, [infiniopOperatorDescriptor_t]),
    ]
    for fn, argtypes in bindings:
        fn.restype = c_int32
        fn.argtypes = argtypes
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment