Commit 975559ee authored by Graylatzhou's avatar Graylatzhou
Browse files

Issue/183 Mul算子CPU&CUDA

parent d54ee0fb
......@@ -19,5 +19,6 @@
#include "infiniop/ops/rope.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/tensor_descriptor.h"
#include "infiniop/ops/mul.h"
#endif // __INFINIOP_API_H__
#ifndef __INFINIOP_MUL_API_H__
#define __INFINIOP_MUL_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
\ No newline at end of file
......@@ -7,6 +7,7 @@
*/
DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
#define REGISTER_INFINIOP_TEST(name) \
{ \
......@@ -24,6 +25,7 @@ DECLARE_INFINIOP_TEST(random_sample)
{ \
REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(mul) \
}
namespace infiniop_test {
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::mul {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopMulDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopMul(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
infiniopMul(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
(warm_ups + 1) / 2, (iterations + 1) / 2);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
// MUL 操作不需要特殊属性(与 GEMM 不同,GEMM 需要 alpha 和 beta)
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::mul
\ No newline at end of file
......@@ -359,7 +359,8 @@ private:
d_input_shapes, d_input_strides, stream));
dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX())));
size_t step = gridDims.x * blockDims.x;
for (size_t i = 0; i < output_size; i += step) {
......
#include "mul_cpu.h"
namespace op::mul::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch(_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<MulOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
}
#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(mul, cpu)
namespace op::mul::cpu {
typedef struct MulOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &a, const T &b) const {
return a * b;
}
} MulOp;
} // namespace op::mul::cpu
#endif // __MUL_CPU_H__
\ No newline at end of file
#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"
namespace op::mul::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::cuda
\ No newline at end of file
#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(mul, cuda)
#endif // __MUL_CUDA_API_H__
\ No newline at end of file
#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::mul::cuda {
typedef struct MulOp {
static constexpr size_t num_inputs = 2;
template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmul2(a, b);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(a, b);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(a, b);
} else {
return a * b;
}
}
} MulOp;
} // namespace op::add::cuda
#endif // __MUL_CUDA_H__
\ No newline at end of file
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"
#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif
__C infiniStatus_t infiniopCreateMulDescriptor(
infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::mul::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopMul(
infiniopMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
\ No newline at end of file
from ast import List
import numpy as np
import gguf
from typing import List
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
def mul(
a: np.ndarray,
b: np.ndarray,
c: np.ndarray = None,
):
if c is None:
return np.multiply(a, b)
return np.multiply(a, b, out=c)
def random_tensor(shape, dtype):
rate = 1e-3
var = 0.5 * rate # 数值范围在[-5e-4, 5e-4]
return rate * np.random.rand(*shape).astype(dtype) - var
class MulTestCase(InfiniopTestCase):
def __init__(
self,
a: np.ndarray,
stride_a: List[int] | None,
b: np.ndarray,
stride_b: List[int] | None,
c: np.ndarray,
stride_c: List[int] | None,
):
super().__init__("mul")
self.a = a
self.stride_a = stride_a
self.b = b
self.stride_b = stride_b
self.c = c
self.stride_c = stride_c
def write_test(self, test_writer: "InfiniopTestWriter"):
super().write_test(test_writer)
if self.stride_a is not None:
test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
if self.stride_b is not None:
test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
if self.stride_c is not None:
test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
test_writer.add_tensor(
test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
)
test_writer.add_tensor(
test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
)
ans = mul(self.a, self.b, self.c)
test_writer.add_tensor(
test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
)
if __name__ == '__main__':
test_writer = InfiniopTestWriter("mul.gguf")
test_cases = [
MulTestCase(
random_tensor((2, 3), np.float32),
gguf_strides(3, 1),
random_tensor((2, 3), np.float32),
gguf_strides(3, 1),
random_tensor((2, 3), np.float32),
gguf_strides(3, 1),
),
MulTestCase(
random_tensor((2, 3), np.float16),
gguf_strides(3, 1),
random_tensor((2, 3), np.float16),
gguf_strides(3, 1),
random_tensor((2, 3), np.float16),
gguf_strides(3, 1),
),
MulTestCase(
random_tensor((2, 3), np.float64),
gguf_strides(3, 1),
random_tensor((2, 3), np.float64),
gguf_strides(3, 1),
random_tensor((2, 3), np.float64),
gguf_strides(3, 1),
),
MulTestCase(
random_tensor((4, 6), np.float16),
gguf_strides(1, 4),
random_tensor((4, 6), np.float16),
gguf_strides(1, 5),
random_tensor((4, 6), np.float16),
gguf_strides(1, 4),
),
MulTestCase(
random_tensor((1, 2048), np.float16),
gguf_strides(1, 2048),
random_tensor((1, 2048), np.float16),
gguf_strides(1, 2048),
random_tensor((1, 2048), np.float16),
gguf_strides(1, 2048),
),
MulTestCase(
random_tensor((2048, 2048), np.float32),
None,
random_tensor((2048, 2048), np.float32),
None,
random_tensor((2048, 2048), np.float32),
None,
),
MulTestCase(
random_tensor((2, 4, 2048), np.float16),
None,
random_tensor((2, 4, 2048), np.float16),
None,
random_tensor((2, 4, 2048), np.float16),
None,
),
MulTestCase(
random_tensor((2, 4, 2048), np.float32),
None,
random_tensor((2, 4, 2048), np.float32),
None,
random_tensor((2, 4, 2048), np.float32),
None,
),
MulTestCase(
random_tensor((2048, 2560), np.float32),
gguf_strides(1, 2560),
random_tensor((2048, 2560), np.float32),
gguf_strides(1, 2560),
random_tensor((2048, 2560), np.float32),
gguf_strides(1, 2560),
),
MulTestCase(
random_tensor((4, 48, 64), np.float16),
None,
random_tensor((4, 48, 64), np.float16),
None,
random_tensor((4, 48, 64), np.float16),
None
),
MulTestCase(
random_tensor((4, 48, 64), np.float32),
None,
random_tensor((4, 48, 64), np.float32),
None,
random_tensor((4, 48, 64), np.float32),
None
),
]
test_writer.add_tests(test_cases)
test_writer.save()
import torch
import ctypes
from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
from libinfiniop import (
infiniopHandle_t,
infiniopTensorDescriptor_t,
open_lib,
to_tensor,
get_test_devices,
check_error,
rearrange_if_needed,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
create_workspace,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
torch.float16: {"atol": 1e-4, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class MulDescriptor(Structure):
_fields_ = [("device", c_int32)]
infiniopMulDescriptor_t = POINTER(MulDescriptor)
def mul(x, y):
return torch.mul(x, y)
def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
"""
rearrange the tensors if needed and apply the inplace config.
if inplace is true and the output (i.e., c) is placed to the broadcasted input,
the inplace config is ignored and out-of-place is used
"""
original_c_strides = c_strides if c_strides else c.stride()
def _rearrange(tensor, strides):
if strides and 0 in strides:
tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
return tensor
else:
return rearrange_if_needed(tensor, strides)
a, b, c = [
_rearrange(tensor, stride)
for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
]
c = (
c
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
# if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
if 0 in c.stride():
c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
return a, b, c
def test(
lib,
handle,
torch_device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
print(
f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"dtype:{dtype} inplace:{inplace}"
)
a = torch.rand(shape, dtype=dtype).to(torch_device)
b = torch.rand(shape, dtype=dtype).to(torch_device)
c = torch.rand(shape, dtype=dtype).to(torch_device)
a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
ans = mul(a, b)
a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
if sync is not None:
sync()
descriptor = infiniopMulDescriptor_t()
check_error(
lib.infiniopCreateMulDescriptor(
handle,
ctypes.byref(descriptor),
c_tensor.descriptor,
a_tensor.descriptor,
b_tensor.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.destroyDesc(lib)
workspace_size = c_uint64(0)
check_error(
lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = create_workspace(workspace_size.value, c.device)
def lib_mul():
check_error(
lib.infiniopMul(
descriptor,
workspace.data_ptr() if workspace is not None else None,
workspace_size.value,
c_tensor.data,
a_tensor.data,
b_tensor.data,
None,
)
)
lib_mul()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(c, ans, atol=atol, rtol=rtol)
assert torch.allclose(c, ans, atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(lib.infiniopDestroyMulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
lib = open_lib()
lib.infiniopCreateMulDescriptor.restype = c_int32
lib.infiniopCreateMulDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopMulDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetMulWorkspaceSize.restype = c_int32
lib.infiniopGetMulWorkspaceSize.argtypes = [
infiniopMulDescriptor_t,
POINTER(c_uint64),
]
lib.infiniopMul.restype = c_int32
lib.infiniopMul.argtypes = [
infiniopMulDescriptor_t,
c_void_p,
c_uint64,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyMulDescriptor.restype = c_int32
lib.infiniopDestroyMulDescriptor.argtypes = [
infiniopMulDescriptor_t,
]
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment