Unverified commit 93191613 authored by thatPepe, committed by GitHub

Merge pull request #1075 from InfiniTensor/RevertT_1-1-4

Revert T1-1-4
parents 6ab911c3 def22a08
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "var_mean_moore.h"
namespace op::var_mean::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
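// Workspace layout, consumed in the same order by launchKernel:
// [ size_t permuted_input_shape[ndim] | ptrdiff_t permuted_input_strides[ndim] ]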
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
// The result is NaN when there is nothing to reduce (reduce_num == 0) or when
// an unbiased variance divides by n - 1 == 0 (reduce_num == 1).
bool IsNanOut(const VarMeanInfo &info) {
return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var);
}
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
const VarMeanInfo &info,
Tdata *var_output, Tdata *mean_output, const Tdata *input,
bool unbiased, bool keepdim,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
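// Both copies are enqueued on the same stream as the kernel launches below,
// so stream ordering guarantees the shape/stride tables reach the device
// before any kernel dereferences them.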
bool is_nan = IsNanOut(info);
if (info.reduce_num == input_size) { // scalar output
ComputeType *tmp_buffer;
constexpr size_t MAX_GRID_SIZE = 128;
size_t grid_size = std::min(MAX_GRID_SIZE,
(input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
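// grid_size * 3 elements: presumably one (count, mean, M2) Welford partial
// per block, reduced across blocks inside ComputeVarScalarOut.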
CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
CHECK_MOORE(musaFree(tmp_buffer));
} else {
size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, input_ndim, output_size, reduce_num,
permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
bool unbiased,
bool keepdim,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \
launchKernel<BLOCK_SIZE, Tdata, ComputeType>( \
_info, \
(Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
unbiased, keepdim, \
stream, workspace, workspace_size)
#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, __mt_bfloat16, double); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::var_mean::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "var_mean_nvidia.cuh"
namespace op::var_mean::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
// The result is NaN when there is nothing to reduce (reduce_num == 0) or when
// an unbiased variance divides by n - 1 == 0 (reduce_num == 1).
bool IsNanOut(const VarMeanInfo &info) {
return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var);
}
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
const VarMeanInfo &info,
Tdata *var_output, Tdata *mean_output, const Tdata *input,
bool unbiased, bool keepdim,
cudaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(size_t);
ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(ptrdiff_t);
CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
bool is_nan = IsNanOut(info);
if (info.reduce_num == input_size) { // scalar output
ComputeType *tmp_buffer;
constexpr size_t MAX_GRID_SIZE = 128;
size_t grid_size = std::min(MAX_GRID_SIZE,
(input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
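// cudaFree synchronizes with outstanding device work in practice, so freeing
// right after the asynchronous launch is safe but stalls the pipeline; a
// stream-ordered cudaMallocAsync/cudaFreeAsync pair (CUDA 11.2+) or carving
// tmp_buffer out of the caller-provided workspace would avoid the sync.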
CHECK_CUDA(cudaFree(tmp_buffer));
} else {
size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, input_ndim, output_size, reduce_num,
permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
bool unbiased,
bool keepdim,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \
launchKernel<BLOCK_SIZE, Tdata, ComputeType>( \
_info, \
(Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
unbiased, keepdim, \
stream, workspace, workspace_size)
#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, __nv_bfloat16, double); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::var_mean::nvidia
#ifndef __VAR_MEAN_NVIDIA_H__
#define __VAR_MEAN_NVIDIA_H__
#include "../var_mean_desc.h"
DESCRIPTOR(nvidia);
#endif // __VAR_MEAN_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/var_mean.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/var_mean_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/var_mean_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/var_mean_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/var_mean_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/var_mean_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateVarMeanDescriptor(
infiniopHandle_t handle,
infiniopVarMeanDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::var_mean::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::var_mean::NAMESPACE::Descriptor **>(desc_ptr), \
var_output_desc, \
mean_output_desc, \
input_desc, \
dim, \
dim_size, \
unbiased, \
keepdim)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::var_mean::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopVarMean(
infiniopVarMeanDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, var_output, mean_output, input, unbiased, keepdim, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
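For orientation, the four entry points above are designed to be called in a create / query-workspace / run / destroy sequence. Below is a minimal, hypothetical sketch; the handle and tensor-descriptor setup, the device buffers, the device_alloc helper, and the CHECK macro are illustrative assumptions and are not part of this diff.

// Hypothetical usage sketch (not part of this change).
infiniopHandle_t handle;                                  // assumed: created elsewhere
infiniopTensorDescriptor_t var_desc, mean_desc, in_desc;  // assumed: set up elsewhere
void *var_out, *mean_out;                                 // assumed: device buffers
const void *input;                                        // assumed: device buffer
size_t dims[] = {1};                                      // reduce along dim 1

infiniopVarMeanDescriptor_t desc;
CHECK(infiniopCreateVarMeanDescriptor(handle, &desc, var_desc, mean_desc,
                                      in_desc, dims, 1,
                                      /*unbiased=*/true, /*keepdim=*/false));

size_t ws_size = 0;
CHECK(infiniopGetVarMeanWorkspaceSize(desc, &ws_size));
void *ws = device_alloc(ws_size);                         // assumed allocator

CHECK(infiniopVarMean(desc, ws, ws_size, var_out, mean_out, input,
                      dims, 1, /*unbiased=*/true, /*keepdim=*/false,
                      /*stream=*/nullptr));
CHECK(infiniopDestroyVarMeanDescriptor(desc));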
#ifndef INFINIOP_VAR_MEAN_DESCRIPTOR_H_
#define INFINIOP_VAR_MEAN_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::var_mean::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
VarMeanInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
VarMeanInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t var_output_desc, \
infiniopTensorDescriptor_t mean_output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool unbiased, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *var_output, \
void *mean_output, \
const void *input, \
bool unbiased, \
bool keepdim, \
void *stream) const; \
}; \
}
#endif // INFINIOP_VAR_MEAN_DESCRIPTOR_H_
@@ -13,22 +13,6 @@ struct CustomBFloat16 {
};
typedef struct CustomBFloat16 bf16_t;
inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return !(lhs == rhs);
}
inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return !(lhs == rhs);
}
float _f16_to_f32(fp16_t val);
fp16_t _f32_to_f16(float val);
......
@@ -56,7 +56,7 @@ def parse_test_cases():
for data in _TEST_CASES_DATA:
shape, strides, dim, keepdim, out_strides = data
input_supports_inplace = not is_broadcast(strides)
# out_supports_inplace = not is_broadcast(out_strides)
out_supports_inplace = not is_broadcast(out_strides)
for dtype in _TENSOR_DTYPES:
tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
@@ -81,19 +81,19 @@ def parse_test_cases():
)
# explicit out when supported (create out tensor with computed shape)
# out_shape = _compute_out_shape(shape, dim, keepdim)
# out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
# if out_supports_inplace:
# test_cases.append(
# TestCase(
# inputs=[in_spec],
# kwargs=kwargs,
# output_spec=out_spec,
# comparison_target="out",
# tolerance=tol,
# description="All - INPLACE(out)",
# )
# )
out_shape = _compute_out_shape(shape, dim, keepdim)
out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
if out_supports_inplace:
test_cases.append(
TestCase(
inputs=[in_spec],
kwargs=kwargs,
output_spec=out_spec,
comparison_target="out",
tolerance=tol,
description="All - INPLACE(out)",
)
)
return test_cases
@@ -110,9 +110,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.all(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.all(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.all(*args, **kwargs)
def main():
......
@@ -74,8 +74,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.avg_pool1d(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
def main():
......
@@ -11,8 +11,6 @@ from framework.tensor import TensorInitializer
# Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None)
# infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean')
# The CrossEntropy kernel currently only supports element-wise loss, without
# class weight / ignore_index. The original configurations are kept; once
# these features are implemented, only the filter conditions need relaxing.
_TEST_CASES_DATA = [
((4, 5), (4,), None, False, None),
((8, 10), (8,), None, True, -1),
@@ -22,9 +20,6 @@ _TEST_CASES_DATA = [
((2, 2), (2,), None, True, -100),
]
_SUPPORT_WEIGHT = False
_SUPPORT_IGNORE_INDEX = False
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
@@ -45,11 +40,6 @@ def parse_test_cases():
) in _TEST_CASES_DATA:
for dtype in _TENSOR_DTYPES:
tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
if weight_present and not _SUPPORT_WEIGHT:
continue
if ignore_index is not None and not _SUPPORT_IGNORE_INDEX:
continue
logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype)
target = TensorSpec.from_tensor(
target_shape,
@@ -61,7 +51,7 @@
)
inputs = [logits, target]
kwargs = {"reduction": "none"}
kwargs = {}
if weight_present:
weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype)
inputs.append(weight_spec)
@@ -94,10 +84,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.cross_entropy(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
out = kwargs.pop("out", None)
return infinicore.cross_entropy(*args, out=out, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.cross_entropy(*args, **kwargs)
def main():
......
@@ -74,11 +74,8 @@ def parse_test_cases():
)
)
# Equal produces a bool result, so a float/int input buffer cannot safely be
# reused as the output buffer. In-place is only allowed when the input dtype
# itself is bool; the switch below is reserved for that case.
allow_input_inplace = dtype == infinicore.bool
if allow_input_inplace and a_supports_inplace:
# in-place a
if a_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -90,7 +87,8 @@
)
)
if allow_input_inplace and b_supports_inplace:
# in-place b
if b_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -117,8 +115,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.eq(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.equal(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.eq(*args, **kwargs)
def main():
......
@@ -70,8 +70,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardswish(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.hardswish(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardswish(*args, **kwargs)
def main():
......
@@ -17,6 +17,7 @@ from framework import (
_TEST_CASES_DATA = [
((13, 4), None, -1.0, 1.0),
((13, 4), (10, 1), -0.5, 0.5),
((8, 8, 8), None, -2.0, 2.0),
]
@@ -86,11 +87,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardtanh(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
import infinicore.nn.functional as F
return F.hardtanh(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardtanh(*args, **kwargs)
def main():
......
@@ -20,7 +20,7 @@ _TEST_CASES_DATA = [
((8, 8), None, None, None, None),
((8, 8), (16, 1), 1, False, None),
((2, 3, 4), None, 0, True, None),
((1, 8), None, (0,), False, None), # a tuple dim yields an infini_list in kwargs: dim [0]
((1, 8), None, (0,), False, None),
((16, 64), (128, 1), None, None, None),
((4, 5, 6), (60, 12, 2), 2, True, None),
]
@@ -61,6 +61,7 @@ def parse_test_cases():
description="Sum - OUT_OF_PLACE",
)
)
return test_cases
@@ -76,11 +77,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.sum(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.sum(
*args, **kwargs
) # TODO: locate the corresponding python/infinicore/ops/sum.py
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.sum(*args, **kwargs)
def main():
......
@@ -15,7 +15,7 @@ from framework import (
# Test cases format: (shape, input_strides, k, dim, largest, sorted)
_TEST_CASES_DATA = [
((6, 8), None, 1, 1, False, True),
((6, 8), None, 1, 1, True, True),
((8, 4), (16, 1), 2, 0, True, False),
((5, 5), None, 3, -1, False, True),
((3, 7), (14, 1), 2, 1, True, True),
@@ -55,7 +55,6 @@ def parse_test_cases():
comparison_target=None,
tolerance=tol,
description=f"topk - OUT_OF_PLACE",
output_count=2,
)
)
@@ -78,9 +77,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.topk(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.topk(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.topk(*args, **kwargs)
def main():
......
@@ -76,9 +76,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.var(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.var(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.var(*args, **kwargs)
def main():
......
@@ -15,7 +15,7 @@ from framework import (
# Test cases format: (in_shape, in_strides_or_None, dim_or_None, unbiased_or_None, keepdim_or_None)
# var_mean returns (var, mean)
# Changed in torch version 2.0: Previously this argument was called unbiased and was a boolean with True corresponding to correction=1 and False being correction=0.
_TEST_CASES_DATA = [
((8, 8), None, None, None, None),
((8, 8), (16, 1), 1, True, False),
@@ -27,7 +27,7 @@ _TEST_CASES_DATA = [
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.float32]
......@@ -47,8 +47,6 @@ def parse_test_cases():
kwargs["dim"] = dim
if unbiased is not None:
kwargs["unbiased"] = unbiased
# Changed in version 2.0: Previously this argument was called unbiased and was a boolean with True
# corresponding to correction=1 and False being correction=0.
if keepdim is not None:
kwargs["keepdim"] = keepdim
@@ -78,9 +76,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.var_mean(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.var_mean(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.var_mean(*args, **kwargs)
def main():
......
import ctypes
from ctypes import c_uint64
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES = [
# input_shape, x_stride, y_stride, kernel_size, stride, padding
((2, 3, 16), None, None, 3, None, 0),
((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
((4, 6, 31), None, None, 4, 2, 1),
((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def _effective_stride(stride, kernel_size):
if stride in (None, 0):
return kernel_size
return stride
def _compute_output_shape(input_shape, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
width = input_shape[2]
out_width = (width + 2 * padding - kernel_size) // stride + 1
return (input_shape[0], input_shape[1], out_width)
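# Worked example from the first test case: input (2, 3, 16), kernel_size=3,
# stride=None -> effective stride 3, padding=0 gives
# out_width = (16 + 0 - 3) // 3 + 1 = 5, i.e. output shape (2, 3, 5).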
def avg_pool1d_ref(x, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
out = torch.nn.functional.avg_pool1d(
x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
)
return out.to(x.dtype)
def test(
handle,
device,
input_shape,
x_stride,
y_stride,
kernel_size,
stride,
padding,
dtype=InfiniDtype.F16,
sync=None,
):
stride_value = _effective_stride(stride, kernel_size)
out_shape = _compute_output_shape(
input_shape, kernel_size, stride_value, padding
)
print(
f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
)
x = TestTensor(input_shape, x_stride, dtype, device)
y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
kernel_size,
stride_value,
padding,
)
)
# Invalidate descriptors in tensors after creation to make sure kernels read from arguments
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_avg_pool1d():
check_error(
LIBINFINIOP.infiniopAvgPool1d(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
None,
)
)
lib_avg_pool1d()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
# fmt: off
profile_operation(
"PyTorch",
lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
" lib",
lambda: lib_avg_pool1d(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ------------------------------------------------------------
# Test case configuration
# ------------------------------------------------------------
_TEST_CASES_ = [
((2, 4, 10), None, None), # logits shape, x_stride, y_stride
((1, 128, 32000), None, None),
((4, 512, 1000), None, None),
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
# ------------------------------------------------------------
# PyTorch reference implementation
# ------------------------------------------------------------
def cross_entropy_ref(logits, target):
vocab = logits.shape[-1]
logits_flat = logits.reshape(-1, vocab).float()
target_flat = target.reshape(-1).long()
loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
return loss.view(target.shape).to(logits.dtype)
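# Example: logits of shape (2, 4, 10) flatten to (8, 10); the per-token loss
# of shape (8,) is reshaped back to (2, 4) and cast to the logits dtype.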
def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
logits_shape = shape
label_shape = shape[:-1]
vocab = shape[-1]
print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
x = TestTensor(logits_shape, x_stride, dtype, device)
target = TestTensor(label_shape, None, InfiniDtype.I64, device)
# Generate valid labels
tgt = target.torch_tensor()
tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
target.actual_tensor().copy_(tgt)
reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
y = TestTensor(label_shape, y_stride, dtype, device)
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
)
)
for tensor in [x, y, target]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
workspace = TestWorkspace(workspace_size.value, x.device)
def run():
check_error(
LIBINFINIOP.infiniopCrossEntropy(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
target.data(),
None,
)
)
run()
if sync:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
# The Equal operator generally does not support in-place (float input vs bool output differ in element size)
class Inplace(Enum):
OUT_OF_PLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Input data types under test
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
# Tolerance settings (bool comparisons require an exact match)
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.F32: {"atol": 0, "rtol": 0},
InfiniDtype.BF16: {"atol": 0, "rtol": 0},
InfiniDtype.I32: {"atol": 0, "rtol": 0},
InfiniDtype.I64: {"atol": 0, "rtol": 0},
InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# PyTorch reference implementation
def equal_func(c, a, b):
torch.eq(a, b, out=c)
def test(
handle,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
# Input tensors use the specified dtype (e.g. float16)
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
# [Key change] The output tensor is forced to bool.
# Note: if c_stride is counted in bytes, a bool element is typically 1 byte.
c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
if c.is_broadcast():
return
print(
f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
)
# Run the PyTorch reference
equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
# [Key change] Call the Equal create function
check_error(
LIBINFINIOP.infiniopCreateEqualDescriptor(
handle,
ctypes.byref(descriptor),
c.descriptor, # Output (Bool)
a.descriptor, # Input A
b.descriptor, # Input B
)
)
# Invalidate descriptors
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetEqualWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_equal():
check_error(
LIBINFINIOP.infiniopEqual(
descriptor,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
lib_equal()
# Use the bool tolerance (effectively exact equality)
atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
if DEBUG:
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Verify the result
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Reuse the same test case configuration, since HardSwish is also element-wise
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((13, 4), (0, 1), None),
((13, 4, 4), None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), None),
((16, 5632), None, None),
((16, 5632), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def test(
handle,
device,
shape,
input_stride=None,
output_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
input = TestTensor(shape, input_stride, dtype, device)
if inplace == Inplace.INPLACE:
if input_stride != output_stride:
return
output = input
else:
output = TestTensor(shape, output_stride, dtype, device, mode="ones")
if output.is_broadcast():
return
print(
f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}"
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
new_output = torch.nn.functional.hardswish(input.torch_tensor())
output.update_torch_tensor(new_output)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateHardSwishDescriptor(
handle,
ctypes.byref(descriptor),
output.descriptor,
input.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [input, output]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, output.device)
def lib_hardswish():
check_error(
LIBINFINIOP.infiniopHardSwish(
descriptor,
workspace.data(),
workspace.size(),
output.data(),
input.data(),
None,
)
)
lib_hardswish()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")