Unverified commit d417f967, authored by zhiwu zhou and committed by GitHub

Issue/213 Add conv operator CPU/CUDA implementations

parent c6a3e4c7
@@ -7,9 +7,10 @@ typedef struct InfiniopDescriptor *infiniopConvDescriptor_t;
__C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
infiniopConvDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
void *pads,
void *strides,
void *dilations,
@@ -17,7 +18,7 @@
__C __export infiniStatus_t infiniopGetConvWorkspaceSize(infiniopConvDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, const void *w, const void *bias, void *stream);
__C __export infiniStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
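// A minimal usage sketch of the C API above (hypothetical variable names;
// assumes a valid handle and NCHW tensor descriptors created elsewhere,
// with error checking elided):
//
//   infiniopConvDescriptor_t desc;
//   infiniopCreateConvDescriptor(handle, &desc, y_desc, x_desc, w_desc,
//                                b_desc, pads, strides, dilations, 2);
//   size_t workspace_size;
//   infiniopGetConvWorkspaceSize(desc, &workspace_size);
//   void *workspace = /* allocate workspace_size bytes on the device */;
//   infiniopConv(desc, workspace, workspace_size, y, x, w, bias, stream);
//   infiniopDestroyConvDescriptor(desc);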
#ifndef __CONV_H__
#define __CONV_H__
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::conv::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
infiniDtype_t _dtype; \
ConvInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
ConvInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_dtype(dtype), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y, \
infiniopTensorDescriptor_t x, \
infiniopTensorDescriptor_t w, \
infiniopTensorDescriptor_t b, \
const void *pads, \
const void *strides, \
const void *dilations, \
size_t n); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
const void *w, \
const void *bias, \
void *stream) const; \
}; \
}
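// DESCRIPTOR(NAMESPACE) stamps out one backend-specific descriptor class;
// e.g. DESCRIPTOR(cpu) declares op::conv::cpu::Descriptor, and each backend
// defines create()/calculate() in its own translation unit.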
#endif // __CONV_H__
#include "conv_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
#include <iostream>
namespace op::conv::cpu {
inline size_t calculatePaddedInputSize(const ConvInfo &info) {
std::vector<size_t> shape(info.ndim() + 2);
shape[0] = info.batch();
shape[1] = info.in_channels();
for (size_t i = 0; i < info.ndim(); ++i) {
shape[i + 2] = info.input_dim(i);
}
return op::common_cpu::getPaddedSize(info.ndim() + 2, shape.data(), info.getPadsInfo());
}
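// e.g. a 2-D input [N=1, C=3, H=4, W=4] with pads {1, 1} occupies
// 1 * 3 * (4 + 2) * (4 + 2) = 108 elements once padded, assuming
// getPaddedSize adds the pad on both sides of each spatial axis.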
inline size_t calculateOutputSize(const ConvInfo &info) {
size_t size = info.batch() * info.out_channels();
for (size_t i = 0; i < info.ndim(); ++i) {
size *= info.output_dim(i);
}
return size;
}
inline bool needsPadding(const ConvInfo &info) {
const size_t *pads = info.getPadsInfo();
for (size_t i = 0; i < info.ndim(); ++i) {
if (pads[i] > 0) {
return true;
}
}
return false;
}
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
const void *pads,
const void *strides,
const void *dilations,
size_t n) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = y_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc,
pads, strides, dilations, n);
CHECK_RESULT(result);
size_t workspace_size = 0;
ConvInfo info = result.take();
if (needsPadding(info)) {
workspace_size += calculatePaddedInputSize(info) * infiniSizeOf(dtype);
}
if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) {
// fp16/bf16 accumulate into a float buffer, so reserve extra space.
workspace_size += calculateOutputSize(info) * sizeof(float);
}
*desc_ptr = new Descriptor(
dtype, std::move(info), workspace_size,
nullptr,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
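// fillPaddedInput recurses one axis per call ("ndim" is the current axis:
// 0 = batch, 1 = channel, >= 2 = spatial) and copies x into the interior of
// the zero-initialized padded buffer, offset by the pad on each spatial axis.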
template <typename Tdata>
void fillPaddedInput(
const ConvInfo &info,
const Tdata *x,
const size_t *padded_x_shape,
Tdata *padded_x,
size_t x_index,
size_t padded_x_index,
size_t ndim) {
size_t x_shape_val;
if (ndim == 0) {
x_shape_val = info.batch();
} else if (ndim == 1) {
x_shape_val = info.in_channels();
} else {
x_shape_val = info.input_dim(ndim - 2);
}
const auto padded_x_shape_val = padded_x_shape[ndim];
const auto x_base_index = x_index * x_shape_val;
size_t pad_offset = 0;
if (ndim >= 2 && x_shape_val != padded_x_shape_val) {
pad_offset = info.pad_info(ndim - 2);
}
const auto padded_x_base_index = padded_x_index * padded_x_shape_val + pad_offset;
for (size_t i = 0; i < x_shape_val; ++i) {
if (ndim == info.ndim() + 2 - 1) {
padded_x[padded_x_base_index + i] = x[x_base_index + i];
} else {
fillPaddedInput(info, x, padded_x_shape, padded_x,
x_base_index + i, padded_x_base_index + i, ndim + 1);
}
}
}
template <typename Xdata, typename Ydata>
void _applyConv(
const ConvInfo &info,
Ydata *y,
const Xdata *x,
const Xdata *w,
const size_t *x_shape,
size_t x_index,
size_t w_index,
size_t y_index,
size_t ndim) {
size_t dim_size, kernel_size;
size_t dilation, stride;
if (ndim < 2) {
return;
} else {
dim_size = x_shape[ndim];
kernel_size = info.kernel_dim(ndim - 2);
dilation = info.dilation_info(ndim - 2);
stride = info.stride_info(ndim - 2);
}
if (stride == 0) {
std::cerr << "Error: stride cannot be zero." << std::endl;
return; // avoid a division by zero below
}
const auto steps = (dim_size - dilation * (kernel_size - 1) - 1) / stride + 1;
x_index *= dim_size;
w_index *= kernel_size;
size_t y_stride;
if (ndim == 0) {
y_stride = info.out_channels();
} else if (ndim == 1) {
y_stride = 1;
} else {
y_stride = info.output_dim(ndim - 2);
}
y_index *= y_stride;
for (size_t i = 0; i < steps; ++i, ++y_index) {
for (size_t k = 0; k < kernel_size; ++k) {
const auto curr_x_index = x_index + i * stride + k * dilation;
const auto curr_w_index = w_index + k;
if (ndim == info.ndim() + 1) {
if constexpr (std::is_same<Xdata, fp16_t>::value || std::is_same<Xdata, bf16_t>::value) {
y[y_index] += utils::cast<float>(x[curr_x_index]) * utils::cast<float>(w[curr_w_index]);
} else {
y[y_index] += x[curr_x_index] * w[curr_w_index];
}
} else {
_applyConv(info, y, x, w, x_shape, curr_x_index, curr_w_index,
y_index, ndim + 1);
}
}
}
}
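// applyConv assigns one (batch, out_channel) pair per OpenMP iteration and
// accumulates over the input channels; _applyConv then recurses across the
// spatial axes, where the number of output positions per axis follows the
// usual formula steps = (dim - dilation * (kernel - 1) - 1) / stride + 1.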
template <typename Xdata, typename Ydata>
void applyConv(
const ConvInfo &info,
Ydata *y,
const Xdata *x,
const Xdata *w,
const size_t *x_shape) {
const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info.batch());
const ptrdiff_t out_channels = static_cast<ptrdiff_t>(info.out_channels());
const ptrdiff_t total_iterations = batch_size * out_channels;
#pragma omp parallel for schedule(dynamic)
for (ptrdiff_t iter = 0; iter < total_iterations; ++iter) {
const ptrdiff_t i = iter / out_channels; // batch index
const ptrdiff_t j = iter % out_channels; // output channel index
const size_t y_index = static_cast<size_t>(i) * info.out_channels() + static_cast<size_t>(j);
// inner loop: iterate over the input channels
for (size_t k = 0; k < info.in_channels(); ++k) {
const size_t x_index = static_cast<size_t>(i) * info.in_channels() + k;
const size_t w_index = static_cast<size_t>(j) * info.in_channels() + k;
_applyConv(info, y, x, w, x_shape, x_index, w_index, y_index, 2);
}
}
}
template <typename Xdata, typename Ydata>
void _conv_cpu(
const ConvInfo &info,
void *workspace,
size_t workspace_size,
Ydata *y,
const Xdata *x,
const Xdata *w) {
if (needsPadding(info)) {
auto padded_x = reinterpret_cast<Xdata *>(workspace);
if constexpr (std::is_same<Xdata, fp16_t>::value) {
fp16_t zero_val = utils::cast<fp16_t>(0.0f);
std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
} else if constexpr (std::is_same<Xdata, bf16_t>::value) {
bf16_t zero_val = utils::cast<bf16_t>(0.0f);
std::fill(padded_x, padded_x + calculatePaddedInputSize(info), zero_val);
} else if constexpr (std::is_same<Xdata, float>::value) {
std::fill(padded_x, padded_x + calculatePaddedInputSize(info), 0.0f);
}
fillPaddedInput(info, x, info.getPaddedShape(), padded_x, 0, 0, 0);
applyConv(info, y, padded_x, w, info.getPaddedShape());
} else {
std::vector<size_t> shape(info.ndim() + 2);
shape[0] = info.batch();
shape[1] = info.in_channels();
for (size_t i = 0; i < info.ndim(); ++i) {
shape[i + 2] = info.input_dim(i);
}
applyConv(info, y, x, w, shape.data());
}
}
template <typename Tdata>
infiniStatus_t conv_cpu(
const ConvInfo &info,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias) {
auto y_ptr = reinterpret_cast<Tdata *>(y);
auto x_ptr = reinterpret_cast<const Tdata *>(x);
auto w_ptr = reinterpret_cast<const Tdata *>(w);
auto output_size = calculateOutputSize(info);
if constexpr (std::is_same<Tdata, float>::value) {
std::fill(y_ptr, y_ptr + output_size, 0.0f);
} else {
std::fill(y_ptr, y_ptr + output_size, static_cast<Tdata>(0));
}
_conv_cpu<Tdata, Tdata>(info, workspace, workspace_size, y_ptr, x_ptr, w_ptr);
if (bias != nullptr) {
auto bias_ptr = reinterpret_cast<const Tdata *>(bias);
#pragma omp parallel for
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
y_ptr[i] += bias_ptr[channel_idx];
}
}
return INFINI_STATUS_SUCCESS;
}
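// For fp16/bf16, conv_cpu is specialized below: it accumulates into a float
// buffer placed at the front of the workspace and converts back to the
// narrow type only when writing y, avoiding precision loss in the reduction.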
template <>
infiniStatus_t conv_cpu<fp16_t>(
const ConvInfo &info,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias) {
auto y_float = reinterpret_cast<float *>(workspace);
auto x_half = reinterpret_cast<const fp16_t *>(x);
auto w_half = reinterpret_cast<const fp16_t *>(w);
auto output_size = calculateOutputSize(info);
std::fill(y_float, y_float + output_size, 0.0f);
void *conv_workspace = y_float + output_size;
size_t conv_workspace_size = workspace_size - output_size * sizeof(float);
_conv_cpu<fp16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);
auto y_half = reinterpret_cast<fp16_t *>(y);
if (bias != nullptr) {
auto bias_half = reinterpret_cast<const fp16_t *>(bias);
#pragma omp parallel for
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
float bias_value = utils::cast<float>(bias_half[channel_idx]);
y_float[i] += bias_value;
y_half[i] = utils::cast<fp16_t>(y_float[i]);
}
} else {
#pragma omp parallel for
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
y_half[i] = utils::cast<fp16_t>(y_float[i]);
}
}
return INFINI_STATUS_SUCCESS;
}
template <>
infiniStatus_t conv_cpu<bf16_t>(
const ConvInfo &info,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias) {
auto y_float = reinterpret_cast<float *>(workspace);
auto x_half = reinterpret_cast<const bf16_t *>(x);
auto w_half = reinterpret_cast<const bf16_t *>(w);
auto output_size = calculateOutputSize(info);
std::fill(y_float, y_float + output_size, 0.0f);
void *conv_workspace = y_float + output_size;
size_t conv_workspace_size = workspace_size - output_size * sizeof(float);
_conv_cpu<bf16_t, float>(info, conv_workspace, conv_workspace_size, y_float, x_half, w_half);
auto y_half = reinterpret_cast<bf16_t *>(y);
if (bias != nullptr) {
auto bias_half = reinterpret_cast<const bf16_t *>(bias);
#pragma omp parallel for
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
size_t channel_idx = (i / info.spatial_sizes()) % info.out_channels();
float bias_value = utils::cast<float>(bias_half[channel_idx]);
y_float[i] += bias_value;
y_half[i] = utils::cast<bf16_t>(y_float[i]);
}
} else {
#pragma omp parallel for
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(output_size); ++i) {
y_half[i] = utils::cast<bf16_t>(y_float[i]);
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return conv_cpu<fp16_t>(_info, workspace, workspace_size, y, x, w, bias);
case INFINI_DTYPE_F32:
return conv_cpu<float>(_info, workspace, workspace_size, y, x, w, bias);
case INFINI_DTYPE_BF16:
return conv_cpu<bf16_t>(_info, workspace, workspace_size, y, x, w, bias);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::conv::cpu
#ifndef __CONV_CPU_H__
#define __CONV_CPU_H__
#include "../conv.h"
DESCRIPTOR(cpu)
#endif // __CONV_CPU_H__
#include "../../../devices/cuda/cuda_handle.cuh"
#include "conv_cuda.cuh"
#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \
do { \
if (desc_ptr) { \
destroy_func(desc_ptr); \
desc_ptr = nullptr; \
} \
} while (0)
#define CLEANUP_CUDNN_DESCRIPTORS() \
do { \
DESTROY_CUDNN_DESCRIPTOR(x_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(y_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(w_desc, cudnnDestroyFilterDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(b_desc, cudnnDestroyTensorDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(act_desc, cudnnDestroyActivationDescriptor); \
DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \
} while (0)
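// Opaque owns all cuDNN descriptors for one conv instance; the macros above
// keep the destructor's teardown idempotent.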
namespace op::conv::cuda {
struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal;
cudnnTensorDescriptor_t x_desc = nullptr;
cudnnTensorDescriptor_t y_desc = nullptr;
cudnnFilterDescriptor_t w_desc = nullptr;
cudnnTensorDescriptor_t b_desc = nullptr;
cudnnActivationDescriptor_t act_desc = nullptr;
cudnnConvolutionDescriptor_t conv_desc = nullptr;
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
size_t workspace_size = 0;
private:
Opaque(std::shared_ptr<device::cuda::Handle::Internal> internal_ptr)
: internal(internal_ptr) {}
void initializeDimensionArrays(const ConvInfo &info,
std::vector<int> &input_dims,
std::vector<int> &output_dims,
std::vector<int> &filter_dims,
std::vector<int> &input_strides,
std::vector<int> &output_strides) const {
bool is_1d_conv = (info.ndim() == 1);
int actual_tensor_ndim = is_1d_conv ? 4 : static_cast<int>(info.ndim() + 2);
input_dims[0] = static_cast<int>(info.batch());
input_dims[1] = static_cast<int>(info.in_channels());
output_dims[0] = static_cast<int>(info.batch());
output_dims[1] = static_cast<int>(info.out_channels());
filter_dims[0] = static_cast<int>(info.out_channels());
filter_dims[1] = static_cast<int>(info.in_channels());
if (is_1d_conv) {
input_dims[2] = 1;
input_dims[3] = static_cast<int>(info.input_dim(0));
output_dims[2] = 1;
output_dims[3] = static_cast<int>(info.output_dim(0));
filter_dims[2] = 1;
filter_dims[3] = static_cast<int>(info.kernel_dim(0));
} else {
for (size_t i = 0; i < info.ndim(); ++i) {
input_dims[i + 2] = static_cast<int>(info.input_dim(i));
output_dims[i + 2] = static_cast<int>(info.output_dim(i));
filter_dims[i + 2] = static_cast<int>(info.kernel_dim(i));
}
}
calculateStrides(input_dims, input_strides, actual_tensor_ndim);
calculateStrides(output_dims, output_strides, actual_tensor_ndim);
}
void initializeConvolutionParams(const ConvInfo &info,
std::vector<int> &pads,
std::vector<int> &strides,
std::vector<int> &dilations) const {
bool is_1d_conv = (info.ndim() == 1);
if (is_1d_conv) {
pads[0] = 0;
pads[1] = static_cast<int>(info.pad_info(0));
strides[0] = 1;
strides[1] = static_cast<int>(info.stride_info(0));
dilations[0] = 1;
dilations[1] = static_cast<int>(info.dilation_info(0));
} else {
for (size_t i = 0; i < info.ndim(); ++i) {
pads[i] = static_cast<int>(info.pad_info(i));
strides[i] = static_cast<int>(info.stride_info(i));
dilations[i] = static_cast<int>(info.dilation_info(i));
}
}
}
void calculateStrides(const std::vector<int> &dims,
std::vector<int> &strides,
int ndim) const {
strides[ndim - 1] = 1;
for (int d = ndim - 2; d >= 0; --d) {
strides[d] = strides[d + 1] * dims[d + 1];
}
}
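// e.g. dims {1, 3, 4, 4} yield row-major strides {48, 16, 4, 1}.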
infiniStatus_t getCudnnDataType(infiniDtype_t data_type,
cudnnDataType_t &cudnn_data_type) const {
if (data_type == INFINI_DTYPE_F16 || data_type == INFINI_DTYPE_F32 || data_type == INFINI_DTYPE_BF16) {
cudnn_data_type = device::cuda::getCudnnDtype(data_type);
return INFINI_STATUS_SUCCESS;
}
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
infiniStatus_t createBasicDescriptors(const std::vector<int> &input_dims,
const std::vector<int> &output_dims,
const std::vector<int> &filter_dims,
cudnnDataType_t cudnn_data_type,
int actual_tensor_ndim) {
CHECK_CUDNN(cudnnCreateTensorDescriptor(&x_desc));
CHECK_CUDNN(cudnnCreateTensorDescriptor(&y_desc));
CHECK_CUDNN(cudnnCreateFilterDescriptor(&w_desc));
CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
CHECK_CUDNN(cudnnSetTensorNdDescriptorEx(
x_desc, CUDNN_TENSOR_NCHW, cudnn_data_type,
actual_tensor_ndim, input_dims.data()));
CHECK_CUDNN(cudnnSetTensorNdDescriptorEx(
y_desc, CUDNN_TENSOR_NCHW, cudnn_data_type,
actual_tensor_ndim, output_dims.data()));
CHECK_CUDNN(cudnnSetFilterNdDescriptor(
w_desc, cudnn_data_type, CUDNN_TENSOR_NCHW,
actual_tensor_ndim, filter_dims.data()));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t createBiasDescriptors(const ConvInfo &info,
cudnnDataType_t cudnn_data_type,
int actual_tensor_ndim) {
if (info.bias_dims_size() == 0) {
b_desc = nullptr;
act_desc = nullptr;
return INFINI_STATUS_SUCCESS;
}
std::vector<int> bias_dims_arr(actual_tensor_ndim);
std::vector<int> bias_strides_arr(actual_tensor_ndim);
bias_dims_arr[0] = 1;
bias_dims_arr[1] = static_cast<int>(info.out_channels());
for (int i = 2; i < actual_tensor_ndim; ++i) {
bias_dims_arr[i] = 1;
}
if (actual_tensor_ndim == 4) {
bias_strides_arr[0] = static_cast<int>(info.out_channels());
bias_strides_arr[1] = 1;
bias_strides_arr[2] = 1;
bias_strides_arr[3] = 1;
} else {
calculateStrides(bias_dims_arr, bias_strides_arr, actual_tensor_ndim);
}
CHECK_CUDNN(cudnnCreateTensorDescriptor(&b_desc));
CHECK_CUDNN(cudnnSetTensorNdDescriptor(
b_desc, cudnn_data_type, static_cast<int>(bias_dims_arr.size()),
bias_dims_arr.data(), bias_strides_arr.data()));
CHECK_CUDNN(cudnnCreateActivationDescriptor(&act_desc));
CHECK_CUDNN(cudnnSetActivationDescriptor(
act_desc, CUDNN_ACTIVATION_IDENTITY, CUDNN_NOT_PROPAGATE_NAN, 0.0));
return INFINI_STATUS_SUCCESS;
}
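// The bias is described as a [1, C, 1, ...] tensor so that cuDNN broadcasts
// it over the batch and spatial dimensions in the fused forward call.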
infiniStatus_t setupConvolutionDescriptor(const std::vector<int> &pads,
const std::vector<int> &strides,
const std::vector<int> &dilations,
int spatial_ndim,
cudnnDataType_t compute_type) {
CHECK_CUDNN(cudnnSetConvolutionNdDescriptor(
conv_desc,
spatial_ndim,
pads.data(),
strides.data(),
dilations.data(),
CUDNN_CROSS_CORRELATION,
compute_type));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t setupAlgorithmWithoutBias() {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
CHECK_STATUS(internal->useCudnn(
nullptr,
[&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
handle, x_desc, w_desc, conv_desc, y_desc,
algo, &workspace_size));
return INFINI_STATUS_SUCCESS;
}));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t setupAlgorithmWithBias() {
int maxAlgoCount = 0;
CHECK_STATUS(internal->useCudnn(
nullptr,
[&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &maxAlgoCount));
return INFINI_STATUS_SUCCESS;
}));
if (maxAlgoCount <= 0) {
maxAlgoCount = 8;
}
std::vector<cudnnConvolutionFwdAlgoPerf_t> perf_results(maxAlgoCount);
int algoCounts = 0;
CHECK_STATUS(internal->useCudnn(
nullptr, [&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
handle, x_desc, w_desc, conv_desc, y_desc,
maxAlgoCount, &algoCounts, perf_results.data()));
return INFINI_STATUS_SUCCESS;
}));
if (algoCounts < 1) {
return INFINI_STATUS_BAD_PARAM;
}
// Take the best algorithm reported by cuDNN and query its workspace size.
algo = perf_results[0].algo;
CHECK_STATUS(internal->useCudnn(
nullptr,
[&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
handle, x_desc, w_desc, conv_desc, y_desc,
algo, &workspace_size));
return INFINI_STATUS_SUCCESS;
}));
return INFINI_STATUS_SUCCESS;
}
public:
Opaque(Opaque &&other) noexcept
: internal(std::move(other.internal)),
x_desc(other.x_desc),
y_desc(other.y_desc),
w_desc(other.w_desc),
b_desc(other.b_desc),
act_desc(other.act_desc),
conv_desc(other.conv_desc),
algo(other.algo),
workspace_size(other.workspace_size) {
other.x_desc = nullptr;
other.y_desc = nullptr;
other.w_desc = nullptr;
other.b_desc = nullptr;
other.act_desc = nullptr;
other.conv_desc = nullptr;
other.workspace_size = 0;
}
~Opaque() {
CLEANUP_CUDNN_DESCRIPTORS();
}
infiniStatus_t initializeCudnnContext(ConvInfo &info,
infiniDtype_t data_type,
cudnnDataType_t compute_type) {
bool is_1d_conv = (info.ndim() == 1);
int actual_tensor_ndim = is_1d_conv ? 4 : static_cast<int>(info.ndim() + 2);
int spatial_ndim_for_conv_desc = is_1d_conv ? 2 : static_cast<int>(info.ndim());
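// 1-D convolutions are lifted to 2-D with a unit leading spatial dim,
// since cuDNN's Nd tensor descriptors expect at least 4 dimensions.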
std::vector<int> input_dims_arr(actual_tensor_ndim);
std::vector<int> output_dims_arr(actual_tensor_ndim);
std::vector<int> filter_dims_arr(actual_tensor_ndim);
std::vector<int> input_strides_arr(actual_tensor_ndim);
std::vector<int> output_strides_arr(actual_tensor_ndim);
initializeDimensionArrays(info, input_dims_arr, output_dims_arr,
filter_dims_arr, input_strides_arr, output_strides_arr);
std::vector<int> pads_arr(spatial_ndim_for_conv_desc);
std::vector<int> strides_arr(spatial_ndim_for_conv_desc);
std::vector<int> dilations_arr(spatial_ndim_for_conv_desc);
initializeConvolutionParams(info, pads_arr, strides_arr, dilations_arr);
cudnnDataType_t cudnn_data_type;
CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type));
CHECK_STATUS(createBasicDescriptors(input_dims_arr, output_dims_arr,
filter_dims_arr, cudnn_data_type, actual_tensor_ndim));
CHECK_STATUS(createBiasDescriptors(info, cudnn_data_type, actual_tensor_ndim));
CHECK_STATUS(setupConvolutionDescriptor(pads_arr, strides_arr, dilations_arr,
spatial_ndim_for_conv_desc, compute_type));
if (info.bias_dims_size() == 0) {
CHECK_STATUS(setupAlgorithmWithoutBias());
} else {
CHECK_STATUS(setupAlgorithmWithBias());
}
return INFINI_STATUS_SUCCESS;
}
static inline utils::Result<Opaque> create(
std::shared_ptr<device::cuda::Handle::Internal> internal_ptr,
ConvInfo &info,
infiniDtype_t data_type) {
Opaque opaque(internal_ptr);
auto status = opaque.initializeCudnnContext(info, data_type, CUDNN_DATA_FLOAT);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
return utils::Result<Opaque>(std::move(opaque));
}
};
Descriptor::~Descriptor() {
if (_opaque) {
delete _opaque;
}
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
const void *pads,
const void *strides,
const void *dilations,
size_t n) {
auto handle = reinterpret_cast<device::cuda::nvidia::Handle *>(handle_);
auto dtype = y_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
auto result = ConvInfo::create(handle_, y_desc, x_desc, w_desc, b_desc,
pads, strides, dilations, n);
CHECK_RESULT(result);
auto conv_info = result.take();
auto opaque_result = Opaque::create(handle->internal(), conv_info, dtype);
CHECK_RESULT(opaque_result);
auto opaque = new Opaque(std::move(opaque_result.take()));
*desc_ptr = new Descriptor(
dtype,
std::move(conv_info),
opaque->workspace_size,
opaque,
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias,
void *stream) const {
const float alpha = 1.0f, beta = 0.0f;
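// With a bias, one fused cuDNN call computes
// y = identity(alpha * conv(x, w) + beta * y + bias).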
if (bias != nullptr) {
CHECK_STATUS(_opaque->internal->useCudnn(
(cudaStream_t)stream, [&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnConvolutionBiasActivationForward(
handle,
&alpha,
_opaque->x_desc,
x,
_opaque->w_desc,
w,
_opaque->conv_desc,
_opaque->algo,
workspace, workspace_size,
&beta,
_opaque->y_desc,
y,
_opaque->b_desc,
bias,
_opaque->act_desc,
_opaque->y_desc,
y));
return INFINI_STATUS_SUCCESS;
}));
} else {
CHECK_STATUS(_opaque->internal->useCudnn(
(cudaStream_t)stream, [&](cudnnHandle_t handle) {
CHECK_CUDNN(cudnnConvolutionForward(
handle,
&alpha,
_opaque->x_desc,
x,
_opaque->w_desc,
w,
_opaque->conv_desc,
_opaque->algo,
workspace, workspace_size,
&beta,
_opaque->y_desc,
y));
return INFINI_STATUS_SUCCESS;
}));
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::conv::cuda
#ifndef __CONV_CUDA_CUH__
#define __CONV_CUDA_CUH__
#include "../../../devices/cuda/cuda_common.cuh"
#include "../conv.h"
DESCRIPTOR(cuda)
#endif // __CONV_CUDA_CUH__
#ifndef __CONV_INFO_H__
#define __CONV_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#ifdef ENABLE_CUDA_API
#include "../../devices/cuda/cuda_handle.cuh"
#endif
namespace op::conv {
class ConvInfo {
private:
std::vector<size_t> _meta;
size_t _ndim;
size_t _batch;
size_t _in_channels;
size_t _out_channels;
size_t _spatial_sizes;
size_t _bias_dims_size;
size_t _padded_shape_size;
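// _meta packs everything into one contiguous size_t buffer:
// [ input_dims | kernel_dims | output_dims | bias_dims | pads | strides | dilations | padded_shape ]
//      ndim         ndim          ndim       bias_size    ndim     ndim       ndim     ndim+2 (if padded)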
ConvInfo(std::vector<size_t> meta,
size_t ndim,
size_t batch,
size_t in_channels,
size_t out_channels,
size_t spatial_sizes,
size_t bias_dims_size,
size_t padded_shape_size)
: _meta(std::move(meta)),
_ndim(ndim),
_batch(batch),
_in_channels(in_channels),
_out_channels(out_channels),
_spatial_sizes(spatial_sizes),
_bias_dims_size(bias_dims_size),
_padded_shape_size(padded_shape_size) {}
public:
inline size_t ndim() const { return _ndim; }
inline size_t batch() const { return _batch; }
inline size_t in_channels() const { return _in_channels; }
inline size_t out_channels() const { return _out_channels; }
inline size_t spatial_sizes() const { return _spatial_sizes; }
inline size_t bias_dims_size() const { return _bias_dims_size; }
inline size_t padded_shape_size() const { return _padded_shape_size; }
inline size_t getMetaMemSize() const {
return _meta.size() * sizeof(size_t);
}
inline const int8_t *getMetaStart() const {
return reinterpret_cast<const int8_t *>(_meta.data());
}
inline const size_t *getInputDims() const {
return _meta.data();
}
inline const size_t *getKernelDims() const {
return getInputDims() + _ndim;
}
inline const size_t *getOutputDims() const {
return getKernelDims() + _ndim;
}
inline const size_t *getBiasDims() const {
return getOutputDims() + _ndim;
}
inline const size_t *getPadsInfo() const {
return getBiasDims() + _bias_dims_size;
}
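// NOTE: strides are stored as ptrdiff_t inside the size_t meta buffer;
// this assumes sizeof(size_t) == sizeof(ptrdiff_t), which holds on the
// supported platforms.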
inline const ptrdiff_t *getStridesInfo() const {
return reinterpret_cast<const ptrdiff_t *>(getPadsInfo()) + _ndim;
}
inline const size_t *getDilationsInfo() const {
return reinterpret_cast<const size_t *>(getStridesInfo()) + _ndim;
}
inline const size_t *getPaddedShape() const {
return getDilationsInfo() + _ndim;
}
inline size_t input_dim(size_t i) const {
return i < _ndim ? getInputDims()[i] : 0;
}
inline size_t kernel_dim(size_t i) const {
return i < _ndim ? getKernelDims()[i] : 0;
}
inline size_t output_dim(size_t i) const {
return i < _ndim ? getOutputDims()[i] : 0;
}
inline size_t bias_dim(size_t i) const {
return i < _bias_dims_size ? getBiasDims()[i] : 0;
}
inline size_t pad_info(size_t i) const {
return i < _ndim ? getPadsInfo()[i] : 0;
}
inline ptrdiff_t stride_info(size_t i) const {
return i < _ndim ? getStridesInfo()[i] : 0;
}
inline size_t dilation_info(size_t i) const {
return i < _ndim ? getDilationsInfo()[i] : 0;
}
inline size_t padded_shape_dim(size_t i) const {
return i < _padded_shape_size ? getPaddedShape()[i] : 0;
}
static utils::Result<ConvInfo> create(
infiniopHandle_t handle_,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
const void *pads,
const void *strides,
const void *dilations,
size_t n);
};
inline utils::Result<size_t> calculateConvOutputSize(
size_t input_size,
size_t kernel_size,
size_t padding,
size_t stride,
size_t dilation) {
if (stride == 0 || dilation == 0 || kernel_size == 0) {
return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
}
size_t effective_kernel = dilation * (kernel_size - 1) + 1;
size_t padded_input = input_size + 2 * padding;
if (padded_input < effective_kernel) {
return utils::Result<size_t>(INFINI_STATUS_BAD_TENSOR_SHAPE);
}
size_t output_size = (padded_input - effective_kernel) / stride + 1;
return utils::Result<size_t>(output_size);
}
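// e.g. input 32, kernel 5, pad 2, stride 2, dilation 1:
// effective_kernel = 5, padded_input = 36, output = (36 - 5) / 2 + 1 = 16.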
inline utils::Result<ConvInfo> ConvInfo::create(
infiniopHandle_t handle_,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
const void *pads,
const void *strides,
const void *dilations,
size_t n) {
auto dtype = y_desc->dtype();
if (dtype != x_desc->dtype() || dtype != w_desc->dtype()) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
size_t ndim = n;
size_t new_dims = n + 2;
if (x_desc->ndim() < new_dims || y_desc->ndim() < new_dims || w_desc->ndim() < new_dims) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t batch = x_desc->shape()[0];
size_t in_channels = x_desc->shape()[1];
size_t out_channels = w_desc->shape()[0];
if (y_desc->shape()[0] != batch || y_desc->shape()[1] != out_channels || w_desc->shape()[1] != in_channels) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t bias_dims_size = (b_desc != nullptr) ? x_desc->ndim() : 0;
const size_t *pads_ptr = reinterpret_cast<const size_t *>(pads);
bool has_padding = false;
if (pads_ptr != nullptr) {
for (size_t i = 0; i < ndim; ++i) {
if (pads_ptr[i] > 0) {
has_padding = true;
break;
}
}
}
size_t padded_shape_size = has_padding ? (ndim + 2) : 0;
// total meta size: 6 * ndim (input/kernel/output dims + pads + strides + dilations) + bias dims + padded shape
size_t meta_size = ndim * 6 + bias_dims_size + padded_shape_size;
std::vector<size_t> meta(meta_size);
size_t *input_dims = meta.data();
size_t *kernel_dims = input_dims + ndim;
size_t *output_dims = kernel_dims + ndim;
size_t *bias_dims = output_dims + ndim;
size_t *pads_info = bias_dims + bias_dims_size;
ptrdiff_t *strides_info = reinterpret_cast<ptrdiff_t *>(pads_info) + ndim;
size_t *dilations_info = reinterpret_cast<size_t *>(strides_info) + ndim;
size_t *padded_shape = dilations_info + ndim;
const ptrdiff_t *strides_ptr = reinterpret_cast<const ptrdiff_t *>(strides);
const size_t *dilations_ptr = reinterpret_cast<const size_t *>(dilations);
size_t spatial_sizes = 1;
for (size_t i = 0; i < ndim; i++) {
input_dims[i] = x_desc->shape()[i + 2];
kernel_dims[i] = w_desc->shape()[i + 2];
output_dims[i] = y_desc->shape()[i + 2];
pads_info[i] = pads_ptr == nullptr ? 0 : pads_ptr[i];
strides_info[i] = strides_ptr == nullptr ? 1 : strides_ptr[i];
dilations_info[i] = dilations_ptr == nullptr ? 1 : dilations_ptr[i];
spatial_sizes = spatial_sizes * output_dims[i];
auto output_result = calculateConvOutputSize(
input_dims[i],
kernel_dims[i],
pads_info[i],
strides_info[i],
dilations_info[i]);
CHECK_RESULT(output_result);
size_t expected_output = output_result.take();
if (output_dims[i] != expected_output) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
}
if (bias_dims_size > 0) {
std::fill(bias_dims, bias_dims + bias_dims_size, 1);
bias_dims[1] = b_desc->shape()[0];
}
if (padded_shape_size > 0) {
padded_shape[0] = batch;
padded_shape[1] = in_channels;
for (size_t i = 0; i < ndim; ++i) {
padded_shape[i + 2] = input_dims[i] + 2 * pads_info[i];
}
}
ConvInfo info(std::move(meta), ndim, batch, in_channels, out_channels,
spatial_sizes, bias_dims_size, padded_shape_size);
return utils::Result<ConvInfo>(std::move(info));
}
} // namespace op::conv
#endif // __CONV_INFO_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/conv.h"
#ifdef ENABLE_CPU_API
#include "cpu/conv_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/conv_cuda.cuh"
#endif
__C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle,
infiniopConvDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t w_desc,
infiniopTensorDescriptor_t b_desc,
void *pads,
void *strides,
void *dilations,
size_t n) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::conv::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::conv::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
x_desc, \
w_desc, \
b_desc, \
pads, \
strides, \
dilations, \
n)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__C infiniStatus_t
infiniopGetConvWorkspaceSize(
infiniopConvDescriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__C infiniStatus_t infiniopConv(
infiniopConvDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *w,
const void *bias,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, \
y, \
x, \
w, \
bias, \
stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::conv::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
import torch
import ctypes
from operatorspy import (
open_lib,
to_tensor,
DeviceEnum,
infiniopHandle_t,
infiniopTensorDescriptor_t,
create_handle,
destroy_handle,
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
from typing import List, Tuple
import math
from torch.nn import functional as F
# constants that control whether to profile the pytorch and lib functions
# NOTE: need to manually add synchronization function to the lib function,
@@ -29,36 +29,91 @@
_TEST_CASES = [
# x_shape, x_stride, w_shape, w_stride, pads, strides, dilations
(
(32, 3, 4),
(12, 4, 1),
(32, 3, 5),
(15, 5, 1),
(1,),
(1,),
(1,),
),
(
(1, 3, 4, 4),
(48, 16, 4, 1),
(2, 3, 3, 3),
(27, 9, 3, 1),
(1, 1),
(1, 2),
(2, 1),
),
(
(32, 3, 32, 32),
(32 * 32 * 3, 32 * 32, 32, 1),
(64, 3, 5, 5),
(75, 25, 5, 1),
(2, 2),
(2, 2),
(1, 1),
),
(
(1, 1, 4, 4, 4),
(64, 64, 16, 4, 1),
(1, 1, 5, 5, 5),
(125, 125, 25, 5, 1),
(1, 1, 1),
(1, 1, 1),
(1, 1, 1),
),
(
(32, 3, 32, 32, 32),
(32 * 32 * 32 * 3, 32 * 32 * 32, 32 * 32, 32, 1),
(64, 3, 5, 5, 5),
(375, 125, 25, 5, 1),
(3, 2, 2),
(4, 3, 3),
(2, 2, 1),
),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
    match len(x.shape) - 2:
        case 1:
            y_tensor.copy_(F.conv1d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation))
        case 2:
            y_tensor.copy_(F.conv2d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation))
        case 3:
            y_tensor.copy_(F.conv3d(x, w, bias=bias, stride=stride, padding=padding, dilation=dilation))
        case _:
            print("Error: Pytorch -> Unsupported tensor dimension")
            return None
# infer the output shape and strides for an N-ary convolution given the inputs
def inferShapeStride(
    x_shape: List[int],
    w_shape: List[int],
    pads: List[int],
    strides: List[int],
    dilations: List[int],
) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
    assert (
        len(x_shape)
        == len(w_shape)
@@ -74,7 +129,12 @@
        )
        for i in range(len(pads))
    ]
    output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims)
    output_strides = [1]
    for s in reversed(output_shape[1:]):
        output_strides.insert(0, output_strides[0] * s)
    output_strides = tuple(output_strides)
    return output_shape, output_strides
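# e.g. x (1, 3, 4, 4), w (2, 3, 3, 3), pads (1, 1), strides (1, 2),
# dilations (2, 1) -> shape (1, 2, 2, 2), strides (8, 4, 2, 1)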
# convert a python tuple to a ctype void pointer
@@ -85,52 +145,43 @@ def tuple_to_void_p(py_tuple: Tuple):
def test(
    handle,
    device,
    x_shape,
    x_stride,
    w_shape,
    w_stride,
    pads,
    strides,
    dilations,
    tensor_dtype=InfiniDtype.F16,
    sync=None,
):
    assert len(pads) == len(strides) == len(dilations)
    x = TestTensor(x_shape, x_stride, dt=tensor_dtype, device=device, scale=0.01)
    w = TestTensor(w_shape, w_stride, dt=tensor_dtype, device=device, scale=0.01)
    y_shape, y_stride = inferShapeStride(x_shape, w_shape, pads, strides, dilations)
    y = TestTensor(y_shape, y_stride, dt=tensor_dtype, device=device)
    b = TestTensor((w.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01) if w.shape[0] > 1 else None
    print(
        f"Testing Conv on {InfiniDeviceNames[device]} with x_shape: {x_shape}, w_shape: {w_shape}, b_shape: {w_shape[0]}, pads: {pads}, strides: {strides}, dilations: {dilations}, x_stride: {x_stride} "
        f"dtype: {InfiniDtypeNames[tensor_dtype]}"
    )
    conv(
        x.torch_tensor(),
        w.torch_tensor(),
        strides,
        pads,
        dilations,
        y.torch_tensor(),
        b.torch_tensor() if b is not None else None,
    )
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateConvDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
            w.descriptor,
            b.descriptor if b is not None else None,
            tuple_to_void_p(pads),
            tuple_to_void_p(strides),
            tuple_to_void_p(dilations),
@@ -139,169 +190,55 @@
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [x, y, w, b]:
        if tensor is not None:
            tensor.destroy_desc()
    workspace_size = ctypes.c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspace_size))
    )
    workspace = TestWorkspace(workspace_size.value, y.device)
    def lib_conv():
        check_error(
            LIBINFINIOP.infiniopConv(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                w.data(),
                b.data() if b is not None else None,
                None,
            )
        )
    lib_conv()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: conv(x.torch_tensor(), w.torch_tensor(), strides, pads, dilations, y.torch_tensor(), b.torch_tensor() if b is not None else None), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_conv(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyConvDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Configure testing options
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
@@ -453,3 +453,39 @@ def swiglu_(lib):
    lib.infiniopDestroySwiGLUDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def conv_(lib):
    lib.infiniopCreateConvDescriptor.restype = c_int32
    lib.infiniopCreateConvDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_size_t,
    ]
    lib.infiniopGetConvWorkspaceSize.restype = c_int32
    lib.infiniopGetConvWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopConv.restype = c_int32
    lib.infiniopConv.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
        c_void_p,
    ]
    lib.infiniopDestroyConvDescriptor.restype = c_int32
    lib.infiniopDestroyConvDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]