Unverified Commit 7f295448 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #1056 from InfiniTensor/issue/1031

【算子比赛2025秋】T1-1-9 & T1-1-17
parents e60985dc 0391d018
...@@ -49,7 +49,13 @@ from infinicore.dtype import ( ...@@ -49,7 +49,13 @@ from infinicore.dtype import (
) )
from infinicore.ops.add import add from infinicore.ops.add import add
from infinicore.ops.add_rms_norm import add_rms_norm from infinicore.ops.add_rms_norm import add_rms_norm
from infinicore.ops.asinh import asinh
from infinicore.ops.attention import attention from infinicore.ops.attention import attention
from infinicore.ops.baddbmm import baddbmm
from infinicore.ops.bilinear import bilinear
from infinicore.ops.cross_entropy import cross_entropy
from infinicore.ops.equal import equal
from infinicore.ops.fmod import fmod
from infinicore.ops.kv_caching import kv_caching from infinicore.ops.kv_caching import kv_caching
from infinicore.ops.matmul import matmul from infinicore.ops.matmul import matmul
from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mha_varlen import mha_varlen
...@@ -123,12 +129,18 @@ __all__ = [ ...@@ -123,12 +129,18 @@ __all__ = [
"add_rms_norm_", "add_rms_norm_",
"attention", "attention",
"kv_caching", "kv_caching",
"asinh",
"baddbmm",
"bilinear",
"fmod",
"matmul", "matmul",
"equal",
"mul", "mul",
"narrow", "narrow",
"squeeze", "squeeze",
"unsqueeze", "unsqueeze",
"rearrange", "rearrange",
"cross_entropy",
"empty", "empty",
"empty_like", "empty_like",
"from_blob", "from_blob",
......
from .adaptive_max_pool1d import adaptive_max_pool1d
from .avg_pool1d import avg_pool1d
from .causal_softmax import causal_softmax from .causal_softmax import causal_softmax
from .embedding import embedding from .embedding import embedding
from .flash_attention import flash_attention from .flash_attention import flash_attention
from .hardswish import hardswish
from .hardtanh import hardtanh
from .linear import linear from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8 from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample from .random_sample import random_sample
...@@ -11,6 +15,7 @@ from .silu_and_mul import silu_and_mul ...@@ -11,6 +15,7 @@ from .silu_and_mul import silu_and_mul
from .swiglu import swiglu from .swiglu import swiglu
__all__ = [ __all__ = [
"adaptive_max_pool1d",
"causal_softmax", "causal_softmax",
"embedding", "embedding",
"flash_attention", "flash_attention",
...@@ -20,6 +25,9 @@ __all__ = [ ...@@ -20,6 +25,9 @@ __all__ = [
"RopeAlgo", "RopeAlgo",
"rope", "rope",
"silu", "silu",
"hardswish",
"hardtanh",
"avg_pool1d",
"swiglu", "swiglu",
"linear_w8a8i8", "linear_w8a8i8",
"silu_and_mul", "silu_and_mul",
......
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def adaptive_max_pool1d(
    input: Tensor,
    output_size: int,
    *,
    out=None,
) -> Tensor:
    r"""Apply 1D adaptive max pooling over an input of several planes.

    Each output element ``i`` pools the input window

    .. math::
        \text{start} = \left\lfloor \frac{i \cdot L_{in}}{L_{out}} \right\rfloor
        \qquad
        \text{end} = \left\lceil \frac{(i + 1) \cdot L_{in}}{L_{out}} \right\rceil

    where :math:`L_{in}` is the input length and :math:`L_{out}` the output length.

    Args:
        input (Tensor): Input tensor of shape (N, C, L_in).
        output_size (int): Target output length (L_out).
        out (Tensor, optional): Pre-allocated output tensor.

    Returns:
        Tensor: The pooled result (``out`` when it was provided).
    """
    if out is not None:
        # Write directly into the caller-supplied tensor.
        _infinicore.adaptive_max_pool1d_(
            out._underlying, input._underlying, output_size
        )
        return out
    return Tensor(_infinicore.adaptive_max_pool1d(input._underlying, output_size))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def avg_pool1d(
    input: Tensor,
    kernel_size: int,
    stride: int | None = None,
    padding: int = 0,
    *,
    out=None,
) -> Tensor:
    """Apply 1D average pooling over the last dimension of *input*.

    Args:
        input (Tensor): Input tensor (presumably shaped (N, C, L) — enforced
            by the backend).
        kernel_size (int): Size of the pooling window.
        stride (int | None): Window step; ``None`` defaults to ``kernel_size``
            (encoded as 0 for the backend).
        padding (int): Implicit zero padding on both sides.
        out (Tensor, optional): Pre-allocated output tensor.

    Returns:
        Tensor: The pooled result (``out`` when it was provided).
    """
    # The native layer interprets a stride of 0 as "use kernel_size".
    effective_stride = 0 if stride is None else stride
    if out is not None:
        _infinicore.avg_pool1d_(
            out._underlying, input._underlying, kernel_size, effective_stride, padding
        )
        return out
    return Tensor(
        _infinicore.avg_pool1d(
            input._underlying, kernel_size, effective_stride, padding
        )
    )
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor:
    r"""Apply the Hardswish activation function element-wise."""
    # Prefer the ntops fast path on cuda/musa devices when no explicit
    # output tensor was requested and the backend exposes the op.
    if (
        infinicore.use_ntops
        and input.device.type in ("cuda", "musa")
        and out is None
        and hasattr(infinicore.ntops.torch, "hardswish")
    ):
        try:
            return infinicore.ntops.torch.hardswish(input, inplace=inplace)
        except AttributeError:
            # Fall through to the infinicore implementation.
            pass
    if inplace:
        # Overwrite the input buffer and hand it back.
        _infinicore.hardswish_(input._underlying, input._underlying)
        return input
    if out is not None:
        _infinicore.hardswish_(out._underlying, input._underlying)
        return out
    return Tensor(_infinicore.hardswish(input._underlying))
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def hardtanh(
    input: Tensor,
    min_val: float = -1.0,
    max_val: float = 1.0,
    inplace: bool = False,
    *,
    out=None,
) -> Tensor:
    """Clamp the input tensor to the range [min_val, max_val]."""
    if min_val > max_val:
        raise ValueError("min_val must be less than or equal to max_val")
    # Prefer the ntops fast path on cuda/musa when no explicit output
    # tensor was requested and the backend exposes the op.
    if (
        infinicore.use_ntops
        and input.device.type in ("cuda", "musa")
        and out is None
        and hasattr(infinicore.ntops.torch, "hardtanh")
    ):
        try:
            return infinicore.ntops.torch.hardtanh(
                input, min_val=min_val, max_val=max_val, inplace=inplace
            )
        except AttributeError:
            # Fall through to the infinicore implementation.
            pass
    lo = float(min_val)
    hi = float(max_val)
    if inplace:
        # Overwrite the input buffer and hand it back.
        _infinicore.hardtanh_(input._underlying, input._underlying, lo, hi)
        return input
    if out is not None:
        _infinicore.hardtanh_(out._underlying, input._underlying, lo, hi)
        return out
    return Tensor(_infinicore.hardtanh(input._underlying, lo, hi))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def asinh(input, *, out=None):
    """Return the inverse hyperbolic sine of each element of *input*.

    When *out* is given, the result is written into it and *out* is returned.
    """
    if out is not None:
        _infinicore.asinh_(out._underlying, input._underlying)
        return out
    return Tensor(_infinicore.asinh(input._underlying))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def baddbmm(input, batch1, batch2, *, beta=1.0, alpha=1.0, out=None):
    """Batched matrix multiply with an additive term.

    Computes ``beta * input + alpha * (batch1 @ batch2)`` via the native
    backend. When *out* is given, the result is written into it and *out*
    is returned.
    """
    tail = (
        batch1._underlying,
        batch2._underlying,
        float(beta),
        float(alpha),
    )
    if out is not None:
        _infinicore.baddbmm_(out._underlying, input._underlying, *tail)
        return out
    return Tensor(_infinicore.baddbmm(input._underlying, *tail))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def bilinear(input1, input2, weight, bias=None, *, out=None):
    """Apply a bilinear transformation to the two inputs via the backend.

    *bias* is optional; ``None`` is forwarded to the native layer unchanged.
    When *out* is given, the result is written into it and *out* is returned.
    """
    raw_bias = bias._underlying if bias is not None else None
    if out is not None:
        _infinicore.bilinear_(
            out._underlying,
            input1._underlying,
            input2._underlying,
            weight._underlying,
            raw_bias,
        )
        return out
    return Tensor(
        _infinicore.bilinear(
            input1._underlying,
            input2._underlying,
            weight._underlying,
            raw_bias,
        )
    )
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def cross_entropy(
    logits,
    target,
    weight=None,
    *,
    ignore_index=None,
    reduction="none",
    out=None,
):
    """
    Token-wise cross entropy without reduction. The output tensor has the same
    shape as target and uses the logits dtype.

    Only ``reduction in (None, "none")`` is supported; *weight* and
    *ignore_index* must be left unset.
    """
    # Reject the torch-style options the backend does not implement yet.
    if weight is not None:
        raise NotImplementedError("class weights are not supported yet.")
    if ignore_index is not None:
        raise NotImplementedError("ignore_index is not supported yet.")
    if reduction not in (None, "none"):
        raise NotImplementedError("Only reduction='none' is implemented.")
    if out is not None:
        _infinicore.cross_entropy_(
            out._underlying,
            logits._underlying,
            target._underlying,
        )
        return out
    return Tensor(
        _infinicore.cross_entropy(logits._underlying, target._underlying)
    )
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def equal(input, other, *, out=None):
    """Element-wise equality of *input* and *other* via the backend.

    When *out* is given, the result is written into it and *out* is returned.
    """
    if out is not None:
        _infinicore.equal_(out._underlying, input._underlying, other._underlying)
        return out
    return Tensor(_infinicore.equal(input._underlying, other._underlying))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def fmod(input, other, *, out=None):
    """Element-wise remainder (C-style fmod) of *input* / *other*.

    When *out* is given, the result is written into it and *out* is returned.
    """
    if out is not None:
        _infinicore.fmod_(out._underlying, input._underlying, other._underlying)
        return out
    return Tensor(_infinicore.fmod(input._underlying, other._underlying))
import ml_dtypes
import numpy as np import numpy as np
import torch import torch
import infinicore import infinicore
try:
import ml_dtypes
except ModuleNotFoundError:
ml_dtypes = None
def to_torch_dtype(infini_dtype): def to_torch_dtype(infini_dtype):
"""Convert infinicore data type to PyTorch data type""" """Convert infinicore data type to PyTorch data type"""
...@@ -57,7 +61,9 @@ def numpy_to_infinicore_dtype(numpy_dtype): ...@@ -57,7 +61,9 @@ def numpy_to_infinicore_dtype(numpy_dtype):
return infinicore.float64 return infinicore.float64
elif numpy_dtype == np.float16: elif numpy_dtype == np.float16:
return infinicore.float16 return infinicore.float16
elif numpy_dtype == ml_dtypes.bfloat16: elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16:
return infinicore.bfloat16
elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16:
return infinicore.bfloat16 return infinicore.bfloat16
elif numpy_dtype == np.int8: elif numpy_dtype == np.int8:
return infinicore.int8 return infinicore.int8
...@@ -86,6 +92,13 @@ def infinicore_to_numpy_dtype(infini_dtype): ...@@ -86,6 +92,13 @@ def infinicore_to_numpy_dtype(infini_dtype):
elif infini_dtype == infinicore.int16: elif infini_dtype == infinicore.int16:
return np.int16 return np.int16
elif infini_dtype == infinicore.bfloat16: elif infini_dtype == infinicore.bfloat16:
if hasattr(np, "bfloat16"):
return np.bfloat16
if ml_dtypes is None:
raise ModuleNotFoundError(
"ml_dtypes is required for bfloat16 numpy conversion. "
"Please install ml_dtypes."
)
return ml_dtypes.bfloat16 return ml_dtypes.bfloat16
elif infini_dtype == infinicore.int32: elif infini_dtype == infinicore.int32:
return np.int32 return np.int32
......
...@@ -17,12 +17,12 @@ def run_tests(args): ...@@ -17,12 +17,12 @@ def run_tests(args):
"causal_softmax.py", "causal_softmax.py",
"clip.py", "clip.py",
"conv.py", "conv.py",
#"dequantize_awq.py", # "dequantize_awq.py",
"gelu.py", "gelu.py",
"gemm.py", "gemm.py",
#"layer_norm.py", # "layer_norm.py",
"logsoftmax.py", "logsoftmax.py",
#"lp_norm.py", # "lp_norm.py",
"mul.py", "mul.py",
"ones.py", "ones.py",
"random_sample.py", "random_sample.py",
...@@ -31,7 +31,7 @@ def run_tests(args): ...@@ -31,7 +31,7 @@ def run_tests(args):
"rms_norm.py", "rms_norm.py",
"rope.py", "rope.py",
"sigmoid.py", "sigmoid.py",
#"softmax.py", # "softmax.py",
"softplus.py", "softplus.py",
"sub.py", "sub.py",
"swiglu.py", "swiglu.py",
...@@ -42,6 +42,7 @@ def run_tests(args): ...@@ -42,6 +42,7 @@ def run_tests(args):
# "paged_attention.py", # "paged_attention.py",
# "paged_caching.py", # "paged_caching.py",
# "paged_attention_prefill.py" # "paged_attention_prefill.py"
"cross_entropy.py",
]: ]:
result = subprocess.run( result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
#include "infinicore/ops/adaptive_max_pool1d.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// Lazily-constructed per-device-type dispatch table for AdaptiveMaxPool1d kernels.
common::OpDispatcher<AdaptiveMaxPool1d::schema> &AdaptiveMaxPool1d::dispatcher() {
    static common::OpDispatcher<AdaptiveMaxPool1d::schema> table;
    return table;
}

// Run the registered backend kernel, pooling x into the pre-allocated y.
void AdaptiveMaxPool1d::execute(Tensor y, Tensor x, size_t output_size) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
    infinicore::context::setDevice(y->device());
    dispatcher().lookup(y->device().getType())(y, x, output_size);
}

// Allocate the output (input shape with the last dim replaced by output_size)
// and pool into it.
Tensor adaptive_max_pool1d(Tensor x, size_t output_size) {
    auto out_shape = x->shape();
    out_shape.back() = output_size;
    auto result = Tensor::empty(out_shape, x->dtype(), x->device());
    adaptive_max_pool1d_(result, x, output_size);
    return result;
}

// In-place variant: pool x into the caller-provided y.
void adaptive_max_pool1d_(Tensor y, Tensor x, size_t output_size) {
    AdaptiveMaxPool1d::execute(y, x, output_size);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/adaptive_max_pool1d.hpp"
#include "infinicore/ops/common/cache.hpp"

#include <infiniop.h>

namespace infinicore::op::adaptive_max_pool1d_impl::infiniop {

// Per-thread LRU cache of infiniop descriptors, keyed by a hash of the
// operands and output size. The deleter destroys a descriptor when it is
// evicted so native resources are not leaked.
thread_local common::OpCache<size_t, infiniopAdaptiveMaxPool1dDescriptor_t> caches(
    100, // capacity
    [](infiniopAdaptiveMaxPool1dDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAdaptiveMaxPool1dDescriptor(desc));
            desc = nullptr;
        }
    });

// Backend kernel: build (or reuse) the infiniop descriptor for this
// (y, x, out) combination, then launch the pooling on the current stream.
// `out` is the target output length along the last dimension.
void calculate(Tensor y, Tensor x, size_t out) {
    // Cache key covers both tensors and the output size, so differently
    // shaped calls get distinct descriptors.
    size_t seed = hash_combine(y, x, out);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopAdaptiveMaxPool1dDescriptor_t desc = nullptr;
    if (!desc_opt) {
        // Cache miss: create a fresh descriptor and remember it.
        INFINICORE_CHECK_ERROR(infiniopCreateAdaptiveMaxPool1dDescriptor(
            context::getInfiniopHandle(y->device()), &desc,
            y->desc(), x->desc(), out));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Allocate the scratch workspace the kernel requires, then launch.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAdaptiveMaxPool1dWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopAdaptiveMaxPool1d(
        desc, workspace->data(), workspace_size,
        y->data(), x->data(), context::getStream()));
}

// Self-registration: hook `calculate` into the dispatcher for all device
// types at static-initialization time (false = do not overwrite existing).
static bool registered = []() {
    AdaptiveMaxPool1d::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::adaptive_max_pool1d_impl::infiniop
#include "infinicore/ops/asinh.hpp"

#include "../../utils.hpp"

namespace infinicore::op {

// Lazily-constructed per-device-type dispatch table for Asinh kernels.
// Fix: dropped the stray ';' after the function body for consistency with
// the sibling op wrapper files.
common::OpDispatcher<Asinh::schema> &Asinh::dispatcher() {
    static common::OpDispatcher<Asinh::schema> dispatcher_;
    return dispatcher_;
}

// Run the registered backend kernel, writing asinh(x) into the
// pre-allocated y.
void Asinh::execute(Tensor y, Tensor x) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
    infinicore::context::setDevice(y->device());
    dispatcher().lookup(y->device().getType())(y, x);
}

// Allocate an output matching x and compute asinh element-wise into it.
Tensor asinh(Tensor x) {
    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
    asinh_(y, x);
    return y;
}

// In-place variant: write asinh(x) into the caller-provided y.
void asinh_(Tensor y, Tensor x) {
    Asinh::execute(y, x);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/asinh.hpp"
#include "infinicore/ops/common/cache.hpp"

#include <infiniop.h>

namespace infinicore::op::asinh_impl::infiniop {

// Per-thread LRU cache of infiniop descriptors, keyed by a hash of the
// operand tensors. The deleter destroys a descriptor on eviction so native
// resources are not leaked.
thread_local common::OpCache<size_t, infiniopAsinhDescriptor_t> caches(
    100, // capacity
    [](infiniopAsinhDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc));
            desc = nullptr;
        }
    });

// Backend kernel: build (or reuse) the infiniop descriptor for this (y, x)
// pair, then launch the element-wise asinh on the current stream.
void calculate(Tensor y, Tensor x) {
    size_t seed = hash_combine(y, x);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopAsinhDescriptor_t desc = nullptr;
    if (!desc_opt) {
        // Cache miss: create a fresh descriptor and remember it.
        INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor(
            context::getInfiniopHandle(y->device()), &desc,
            y->desc(), x->desc()));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Allocate the scratch workspace the kernel requires, then launch.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopAsinh(
        desc, workspace->data(), workspace_size,
        y->data(), x->data(), context::getStream()));
}

// Self-registration: hook `calculate` into the dispatcher for all device
// types at static-initialization time (false = do not overwrite existing).
static bool registered = []() {
    Asinh::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::asinh_impl::infiniop
#include "infinicore/ops/avg_pool1d.hpp"

#include "../../utils.hpp"

#include <stdexcept>

namespace infinicore::op {

// Lazily-constructed per-device-type dispatch table for AvgPool1d kernels.
common::OpDispatcher<AvgPool1d::schema> &AvgPool1d::dispatcher() {
    static common::OpDispatcher<AvgPool1d::schema> table;
    return table;
}

// Run the registered backend kernel. A stride of 0 means "default to
// kernel_size" (matching the Python wrapper's encoding of stride=None).
void AvgPool1d::execute(
    Tensor output,
    Tensor input,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
    if (stride == 0) {
        stride = kernel_size;
    }
    infinicore::context::setDevice(output->device());
    const auto device_type = output->device().getType();
    auto kernel = dispatcher().lookup(device_type);
    if (kernel == nullptr) {
        throw std::runtime_error(
            "No AvgPool1d implementation for device type: " + std::to_string(static_cast<int>(device_type)));
    }
    kernel(output, input, kernel_size, stride, padding);
}

// Allocate an output of shape [N, C, L_out] and pool into it.
// L_out = (L_in + 2*padding - kernel_size) / stride + 1.
Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
    if (stride == 0) {
        stride = kernel_size;
    }
    const auto &in_shape = input->shape();
    if (in_shape.size() != 3) {
        throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
    }
    const size_t batch = in_shape[0];
    const size_t channels = in_shape[1];
    const size_t length = in_shape[2];
    // The window must fit inside the padded signal at least once.
    if (length + 2 * padding < kernel_size) {
        throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
    }
    const size_t out_len = (length + 2 * padding - kernel_size) / stride + 1;
    Shape out_shape = {batch, channels, out_len};
    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
    avg_pool1d_(output, input, kernel_size, stride, padding);
    return output;
}

// In-place variant: pool input into the caller-provided output tensor.
void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
    AvgPool1d::execute(output, input, kernel_size, stride, padding);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/avg_pool1d.hpp"
#include "infinicore/ops/common/cache.hpp"

#include <infiniop.h>

namespace infinicore::op::avg_pool1d_impl::infiniop {

// Per-thread LRU cache of infiniop descriptors, keyed by a hash of the
// operands and pooling parameters. The deleter destroys a descriptor on
// eviction so native resources are not leaked.
thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
    100,
    [](infiniopAvgPool1dDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
            desc = nullptr;
        }
    });

// Backend kernel: build (or reuse) the infiniop descriptor for this
// (output, input, kernel_size, stride, padding) combination, then launch
// the pooling on the current stream.
void calculate(
    Tensor output,
    Tensor input,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    // stride == 0 encodes "default to kernel_size"; normalize before
    // hashing so equivalent calls share one descriptor.
    if (stride == 0) {
        stride = kernel_size;
    }
    size_t seed = hash_combine(output, input, kernel_size, stride, padding);
    auto device = context::getDevice();
    auto &cache = caches.getCache(device);
    auto desc_opt = cache.get(seed);
    infiniopAvgPool1dDescriptor_t desc = nullptr;
    if (!desc_opt) {
        // Cache miss: create a fresh descriptor and remember it.
        INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
            context::getInfiniopHandle(device),
            &desc,
            output->desc(),
            input->desc(),
            kernel_size,
            stride,
            padding));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Allocate the scratch workspace the kernel requires, then launch.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
        desc,
        workspace->data(),
        workspace_size,
        output->data(),
        input->data(),
        context::getStream()));
}

// Self-registration: hook `calculate` into the dispatcher for all device
// types at static-initialization time (false = do not overwrite existing).
static bool registered = []() {
    AvgPool1d::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::avg_pool1d_impl::infiniop
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment