Unverified Commit 09d4b2ae authored by thatPepe, committed by GitHub

Merge pull request #1071 from InfiniTensor/issue/1031_T1-1-9

[Operator Competition, Autumn 2025] T1-1-9
parents 5fc85c8b 85f8987c
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/asinh.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::asinh_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAsinhDescriptor_t> caches(
100, // capacity
[](infiniopAsinhDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor y, Tensor x) {
size_t seed = hash_combine(y, x);
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopAsinhDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor(
context::getInfiniopHandle(y->device()), &desc,
y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopAsinh(
desc, workspace->data(), workspace_size,
y->data(), x->data(), context::getStream()));
}
static bool registered = []() {
Asinh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::asinh_impl::infiniop
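// Illustrative sketch (not part of this commit): the backend above memoizes
// infiniop descriptors per device and per tensor-layout hash, so repeated
// calls with identical layouts skip descriptor creation. The same idea in
// plain C++, with hypothetical names (get_or_create is not an InfiniCore API):
#include <cstddef>
#include <unordered_map>
template <typename Desc, typename Make>
Desc &get_or_create(std::unordered_map<std::size_t, Desc> &cache,
                    std::size_t key, Make make) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        it = cache.emplace(key, make()).first; // create once, reuse afterwards
    }
    return it->second;
}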
#include "infinicore/ops/baddbmm.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
// Inline BLAS compatibility check, avoiding extra function-call overhead
inline bool is_blas_compatible(const Tensor &t) {
const auto ndim = t->ndim();
if (ndim == 2) {
const auto rs = t->stride(0);
const auto cs = t->stride(1);
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[0] == 1 || t->shape()[1] == 1;
}
return true;
} else if (ndim == 3) {
const auto rs = t->stride(1);
const auto cs = t->stride(2);
if (t->shape()[0] > 1 && t->stride(0) == 0) {
return false;
}
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[1] == 1 || t->shape()[2] == 1;
}
return true;
}
return false;
}
inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) {
const auto input_ndim = input->ndim();
if (input_ndim == 2) {
rearrange_(output, input->as_strided(
{batch_size, m, n},
{0, input->stride(0), input->stride(1)}));
} else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) {
rearrange_(output, input->as_strided(
{batch_size, m, n},
{0, input->stride(1), input->stride(2)}));
} else {
rearrange_(output, input);
}
}
Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2,
float beta,
float alpha) {
const size_t batch_size = batch1->shape()[0];
const size_t m = batch1->shape()[1];
const size_t n = batch2->shape()[2];
const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
if (beta == 0.0f) {
return gemm(a, b, alpha, 0.0f);
}
Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
prepare_gemm_input(result, input, batch_size, m, n);
gemm_(result, a, b, alpha, beta);
return result;
}
void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2,
float beta,
float alpha) {
const size_t batch_size = batch1->shape()[0];
const size_t m = batch1->shape()[1];
const size_t n = batch2->shape()[2];
const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
const bool out_is_usable = out->is_contiguous() && out->ndim() == 3 && out->shape()[0] == batch_size && out->shape()[1] == m && out->shape()[2] == n;
if (out_is_usable) {
if (beta != 0.0f && input->data() != out->data()) {
prepare_gemm_input(out, input, batch_size, m, n);
}
gemm_(out, a, b, alpha, beta);
} else {
Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
if (beta != 0.0f) {
prepare_gemm_input(result, input, batch_size, m, n);
}
gemm_(result, a, b, alpha, beta);
rearrange_(out, result);
}
}
} // namespace infinicore::op
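// Illustrative reference for the routine above (a plain-C++ sketch on dense
// row-major float buffers, not the library implementation):
//   out[b, i, j] = beta * input[b, i, j]
//                + alpha * sum_k batch1[b, i, k] * batch2[b, k, j]
#include <cstddef>
inline void baddbmm_reference(float *out, const float *input,
                              const float *b1, const float *b2,
                              std::size_t B, std::size_t M, std::size_t K,
                              std::size_t N, float beta, float alpha) {
    for (std::size_t b = 0; b < B; ++b) {
        for (std::size_t i = 0; i < M; ++i) {
            for (std::size_t j = 0; j < N; ++j) {
                float acc = 0.0f;
                for (std::size_t k = 0; k < K; ++k) {
                    acc += b1[(b * M + i) * K + k] * b2[(b * K + k) * N + j];
                }
                const std::size_t o = (b * M + i) * N + j;
                // beta == 0 ignores input entirely, matching the fast path above.
                out[o] = beta * input[o] + alpha * acc;
            }
        }
    }
}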
#include "infinicore/ops/bilinear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
namespace {
inline bool is_gemm_compatible_3d(const Tensor &t) {
if (t->ndim() != 3) {
return false;
}
const auto batch = t->shape()[0];
const auto rows = t->shape()[1];
const auto cols = t->shape()[2];
const auto bs = t->stride(0);
const auto rs = t->stride(1);
const auto cs = t->stride(2);
if (rs != 1 && cs != 1) {
return false;
}
if (cs == 1) {
if (rs < static_cast<int64_t>(cols)) {
return false;
}
} else {
if (cs < static_cast<int64_t>(rows)) {
return false;
}
}
if (batch > 1 && bs == 0) {
return false;
}
return true;
}
inline Tensor ensure_gemm_compatible(const Tensor &t) {
if (t->ndim() == 2) {
return t->is_contiguous() ? t : rearrange(t);
} else if (t->ndim() == 3) {
return is_gemm_compatible_3d(t) ? t : rearrange(t);
}
return t->is_contiguous() ? t : rearrange(t);
}
} // anonymous namespace
Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
const size_t batch_size = x1->shape()[0];
const size_t in1_features = x1->shape()[1];
const size_t in2_features = x2->shape()[1];
const size_t out_features = weight->shape()[0];
Tensor x1_compat = ensure_gemm_compatible(x1);
Tensor x2_compat = ensure_gemm_compatible(x2);
Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous();
Tensor weight_permuted = weight_cont->permute({1, 0, 2});
Tensor weight_permuted_cont = weight_permuted->is_contiguous()
? weight_permuted
: weight_permuted->contiguous();
Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features});
Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f);
Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features});
Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1});
Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed);
Tensor x2_row = x2_compat->view({batch_size, 1, in2_features});
Tensor x2_row_compat = ensure_gemm_compatible(x2_row);
Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f);
Tensor out = out_3d->view({batch_size, out_features});
if (bias) {
Tensor bias_broadcast = (*bias)->as_strided(
{batch_size, out_features},
{0, (*bias)->strides()[0]});
out = add(out, bias_broadcast);
}
return out;
}
void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
Tensor result = bilinear(x1, x2, weight, bias);
rearrange_(out, result);
}
} // namespace infinicore::op
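// Illustrative reference for bilinear above (plain-C++ sketch on dense
// row-major buffers; bilinear_reference is a hypothetical helper):
//   out[b, o] = x1[b, :] . weight[o, :, :] . x2[b, :] + bias[o]
// The implementation reshapes this into two batched matmuls instead.
#include <cstddef>
inline void bilinear_reference(float *out, const float *x1, const float *x2,
                               const float *w, const float *bias, // bias may be nullptr
                               std::size_t B, std::size_t O,
                               std::size_t I1, std::size_t I2) {
    for (std::size_t b = 0; b < B; ++b) {
        for (std::size_t o = 0; o < O; ++o) {
            float acc = bias ? bias[o] : 0.0f;
            for (std::size_t i = 0; i < I1; ++i) {
                for (std::size_t j = 0; j < I2; ++j) {
                    acc += x1[b * I1 + i] * w[(o * I1 + i) * I2 + j] * x2[b * I2 + j];
                }
            }
            out[b * O + o] = acc;
        }
    }
}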
#include "infinicore/ops/fmod.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Fmod::schema> &Fmod::dispatcher() {
static common::OpDispatcher<Fmod::schema> dispatcher_;
return dispatcher_;
}
void Fmod::execute(Tensor c, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
}
Tensor fmod(Tensor a, Tensor b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
fmod_(c, a, b);
return c;
}
void fmod_(Tensor c, Tensor a, Tensor b) {
Fmod::execute(c, a, b);
}
} // namespace infinicore::op
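// Semantics note (illustrative, not part of this commit): per element the
// operation presumably follows C's std::fmod, i.e. the result keeps the sign
// of the dividend. A scalar sketch of what each element computes:
#include <cmath>
inline float fmod_scalar(float a, float b) {
    return std::fmod(a, b); // e.g. fmod_scalar(-5.5f, 3.0f) == -2.5f
}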
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/fmod.hpp"
#include <infiniop.h>
namespace infinicore::op::fmod_impl::infiniop {
thread_local common::OpCache<size_t, infiniopFmodDescriptor_t> caches(
100, // capacity
[](infiniopFmodDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, a, b); // key in descriptor-argument order
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopFmodDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor(
context::getInfiniopHandle(c->device()), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopFmod(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
}
static bool registered = []() {
Fmod::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::fmod_impl::infiniop
@@ -2,13 +2,17 @@
#include <pybind11/pybind11.h>
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/addcmul.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/atanh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/baddbmm.hpp"
#include "ops/bilinear.hpp"
#include "ops/binary_cross_entropy_with_logits.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cdist.hpp"
@@ -16,6 +20,7 @@
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmod.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
@@ -46,12 +51,18 @@ namespace py = pybind11;
namespace infinicore::ops {
inline void bind(py::module &m) {
bind_adaptive_max_pool1d(m);
bind_add(m);
bind_add_rms_norm(m);
bind_attention(m);
bind_asinh(m);
bind_baddbmm(m);
bind_bilinear(m);
bind_causal_softmax(m);
bind_flash_attention(m);
bind_kv_caching(m);
bind_fmod(m);
bind_random_sample(m);
bind_linear(m);
bind_matmul(m);
bind_mul(m);
......
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/adaptive_max_pool1d.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_adaptive_max_pool1d(py::module &m) {
m.def("adaptive_max_pool1d",
&op::adaptive_max_pool1d,
py::arg("x"),
py::arg("output_size"),
R"doc(1D Adaptive Max Pooling.
Args:
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
Returns:
Output tensor of shape (N, C, L_out) or (N, L_out)
)doc");
m.def("adaptive_max_pool1d_",
&op::adaptive_max_pool1d_,
py::arg("y"),
py::arg("x"),
py::arg("output_size"),
R"doc(In-place 1D Adaptive Max Pooling.
Args:
y: Output tensor of shape (N, C, L_out) or (N, L_out)
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/asinh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_asinh(py::module &m) {
m.def("asinh",
&op::asinh,
py::arg("x"),
R"doc(Element-wise inverse hyperbolic sine function.)doc");
m.def("asinh_",
&op::asinh_,
py::arg("y"),
py::arg("x"),
R"doc(In-place element-wise inverse hyperbolic sine function.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/baddbmm.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
return op::baddbmm(input, batch1, batch2, beta, alpha);
}
inline void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
op::baddbmm_(out, input, batch1, batch2, beta, alpha);
}
inline void bind_baddbmm(py::module &m) {
m.def("baddbmm",
&py_baddbmm,
py::arg("input"),
py::arg("batch1"),
py::arg("batch2"),
py::arg("beta") = 1.0f,
py::arg("alpha") = 1.0f,
R"doc(Batched matrix-matrix product with addition.
Args:
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
Returns:
Output tensor after baddbmm operation
)doc");
m.def("baddbmm_",
&py_baddbmm_,
py::arg("out"),
py::arg("input"),
py::arg("batch1"),
py::arg("batch2"),
py::arg("beta") = 1.0f,
py::arg("alpha") = 1.0f,
R"doc(In-place batched matrix-matrix product with addition.
Args:
out: Output tensor
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/bilinear.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
return op::bilinear(x1, x2, weight, bias_tensor);
}
inline void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
op::bilinear_(out, x1, x2, weight, bias_tensor);
}
inline void bind_bilinear(py::module &m) {
m.def("bilinear",
&py_bilinear,
py::arg("x1"),
py::arg("x2"),
py::arg("weight"),
py::arg("bias"),
R"doc(Bilinear transformation of two input tensors.
Args:
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
Returns:
Output tensor after bilinear transformation
)doc");
m.def("bilinear_",
&py_bilinear_,
py::arg("out"),
py::arg("x1"),
py::arg("x2"),
py::arg("weight"),
py::arg("bias"),
R"doc(In-place bilinear transformation of two input tensors.
Args:
out: Output tensor
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_fmod(py::module &m) {
m.def("fmod",
&op::fmod,
py::arg("a"),
py::arg("b"),
R"doc(Element-wise floating point remainder of division of two tensors.)doc");
m.def("fmod_",
&op::fmod_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}
} // namespace infinicore::ops
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::adaptive_max_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AdaptiveMaxPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AdaptiveMaxPool1dInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t output_size); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // ADAPTIVE_MAX_POOL1D_H
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const T *x) {
const size_t ndim = info->ndim();
const size_t batch_size = info->shape[0];
const size_t channels = ndim > 2 ? info->shape[1] : 1;
const size_t input_length = info->input_length();
const size_t output_length = info->output_length();
// Total number of work items (batch * channels)
const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
const size_t i = block_idx / channels; // batch index
const size_t j = block_idx % channels; // channel index
const T *x_ptr_base;
T *y_ptr_base;
if (ndim > 2) { // (N, C, L)
x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
} else { // (N, L)
x_ptr_base = x + i * info->x_strides[0];
y_ptr_base = y + i * info->y_strides[0];
}
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
end_index = std::min(end_index, input_length);
const size_t window_len = end_index - start_index;
if (window_len == 0) { // skip empty windows (window_len is unsigned)
continue;
}
const T *window_ptr = x_ptr_base + start_index * x_stride_last;
auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
y_ptr_base[out_idx] = utils::cast<T>(max_val); // assumes unit stride along y's length dim
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (_info.atype == INFINI_DTYPE_F32) {
return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
} else if (_info.atype == INFINI_DTYPE_F16) {
return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
} else if (_info.atype == INFINI_DTYPE_BF16) {
return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
} else if (_info.atype == INFINI_DTYPE_F64) {
return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
}
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::adaptive_max_pool1d::cpu
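// Worked example of the window arithmetic above (illustrative): with
// input_length = 10 and output_length = 4, the four windows are
// [0,3), [2,5), [5,8), [7,10) -- adjacent windows may overlap, and together
// they always cover the whole input. Extracted as a standalone helper:
#include <cstddef>
inline void adaptive_window(std::size_t out_idx, std::size_t in_len,
                            std::size_t out_len,
                            std::size_t &start, std::size_t &end) {
    start = (out_idx * in_len) / out_len;                   // floor(out_idx * in_len / out_len)
    end = ((out_idx + 1) * in_len + out_len - 1) / out_len; // ceil((out_idx + 1) * in_len / out_len)
}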
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(cpu)
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
size_t block_idx = blockIdx.x;
size_t batch_idx = block_idx / channels;
size_t channel_idx = block_idx % channels;
const Tdata *x_ptr;
Tdata *y_ptr;
if (ndim > 2) {
x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
} else {
x_ptr = x + batch_idx * stride_x_batch;
y_ptr = y + batch_idx * stride_y_batch;
}
for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
// Exact integer floor/ceil, matching the CPU backend even for lengths
// where single-precision floats would lose precision.
int start_index = static_cast<int>((out_idx * input_length) / output_length);
int end_index = static_cast<int>(((out_idx + 1) * input_length + output_length - 1) / output_length);
if (end_index <= start_index) {
continue;
}
Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
for (int i = start_index + 1; i < end_index; ++i) {
Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
max_val = max(max_val, val);
}
y_ptr[out_idx] = Tdata(max_val);
}
}
#endif
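// Launch sketch (illustrative; the real wrappers live in the Metax/Moore
// backends below): one block per (batch, channel) pair, BLOCK_SIZE threads
// striding across output positions. For a contiguous (N, C, L) layout:
//
//   adaptiveMaxPool1dKernel<256, half, float><<<N * C, 256, 0, stream>>>(
//       y, /*stride_y_batch=*/C * L_out, /*stride_y_channel=*/L_out,
//       x, /*stride_x_batch=*/C * L_in, /*stride_x_channel=*/L_in,
//       /*stride_x_length=*/1, C, L_in, L_out, /*ndim=*/3);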
#ifndef __ADAPTIVE_MAX_POOL1D_H__
#define __ADAPTIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {
class AdaptiveMaxPool1dInfo {
AdaptiveMaxPool1dInfo() = default;
public:
infiniDtype_t atype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> y_strides;
std::vector<ptrdiff_t> x_strides;
size_t input_size;
size_t output_size;
size_t ndim() const { return shape.size(); }
size_t input_length() const { return input_size; }
size_t output_length() const { return output_size; }
static utils::Result<AdaptiveMaxPool1dInfo> create(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto atype = y_desc->dtype();
if (x_desc->dtype() != atype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16 && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
const size_t y_ndim = y_desc->ndim();
const size_t x_ndim = x_desc->ndim();
if (y_ndim != x_ndim) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
for (size_t i = 0; i < y_ndim - 1; ++i) {
if (x_desc->dim(i) != y_desc->dim(i)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
}
if (y_desc->dim(y_ndim - 1) != output_size) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
atype,
y_desc->shape(),
y_desc->strides(),
x_desc->strides(),
x_desc->dim(x_ndim - 1),
output_size});
}
};
} // namespace op::adaptive_max_pool1d
#endif // __ADAPTIVE_MAX_POOL1D_H__
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(metax)
#endif
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t numblock,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
hcStream_t stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<numblock, BLOCK_SIZE, 0, stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__hpcc_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto stream = reinterpret_cast<hcStream_t>(stream_);
if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::metax
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(moore)
#endif
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__mt_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Pick the largest block size the device supports, checking in descending
// order so the 2048 branch is reachable.
if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore