Unverified Commit 93191613 authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1075 from InfiniTensor/RevertT_1-1-4

Revert T1-1-4
parents 6ab911c3 def22a08
#ifndef __INFINIOP_VAR_API_H__
#define __INFINIOP_VAR_API_H__
#include "../operator_descriptor.h"
#include <cstddef>
#include <vector>
// Opaque descriptor handle for the Var (variance reduction) operator.
typedef struct InfiniopDescriptor *infiniopVarDescriptor_t;
// Create a Var operator descriptor.
//   handle          - library handle for the target device
//   desc_ptr        - out: receives the created descriptor
//   var_output_desc - descriptor of the variance output tensor
//   input_desc      - descriptor of the input tensor
//   dim, dim_size   - array of dim_size axis indices to reduce over
//   unbiased        - presumably selects Bessel's correction (divide by N-1
//                     instead of N) -- TODO confirm against the implementation
//   keepdim         - keep reduced axes as size-1 dimensions in the output
__INFINI_C __export infiniStatus_t infiniopCreateVarDescriptor(infiniopHandle_t handle,
                                                               infiniopVarDescriptor_t *desc_ptr,
                                                               infiniopTensorDescriptor_t var_output_desc,
                                                               infiniopTensorDescriptor_t input_desc,
                                                               size_t *dim,
                                                               size_t dim_size,
                                                               bool unbiased,
                                                               bool keepdim);
// Query the scratch workspace size (in bytes) required by infiniopVar.
__INFINI_C __export infiniStatus_t infiniopGetVarWorkspaceSize(infiniopVarDescriptor_t desc, size_t *size);
// Run the variance reduction described by `desc` on `stream`.
// `workspace` must be at least `workspace_size` bytes (see the query above);
// dim/dim_size/unbiased/keepdim must match the values used at descriptor creation.
__INFINI_C __export infiniStatus_t infiniopVar(infiniopVarDescriptor_t desc,
                                               void *workspace,
                                               size_t workspace_size,
                                               void *var_output,
                                               const void *input,
                                               size_t *dim,
                                               size_t dim_size,
                                               bool unbiased,
                                               bool keepdim,
                                               void *stream);
// Destroy a descriptor created by infiniopCreateVarDescriptor.
__INFINI_C __export infiniStatus_t infiniopDestroyVarDescriptor(infiniopVarDescriptor_t desc);
#endif
#ifndef __INFINIOP_VAR_MEAN_API_H__
#define __INFINIOP_VAR_MEAN_API_H__
#include "../operator_descriptor.h"
#include <cstddef>
#include <vector>
// Opaque descriptor handle for the VarMean operator (computes variance and
// mean of the input in one pass).
typedef struct InfiniopDescriptor *infiniopVarMeanDescriptor_t;
// Create a VarMean operator descriptor.
//   handle           - library handle for the target device
//   desc_ptr         - out: receives the created descriptor
//   var_output_desc  - descriptor of the variance output tensor
//   mean_output_desc - descriptor of the mean output tensor
//   input_desc       - descriptor of the input tensor
//   dim, dim_size    - array of dim_size axis indices to reduce over
//   unbiased         - presumably selects Bessel's correction for the variance
//                      (divide by N-1) -- TODO confirm against the implementation
//   keepdim          - keep reduced axes as size-1 dimensions in the outputs
__INFINI_C __export infiniStatus_t infiniopCreateVarMeanDescriptor(infiniopHandle_t handle,
                                                                   infiniopVarMeanDescriptor_t *desc_ptr,
                                                                   infiniopTensorDescriptor_t var_output_desc,
                                                                   infiniopTensorDescriptor_t mean_output_desc,
                                                                   infiniopTensorDescriptor_t input_desc,
                                                                   size_t *dim,
                                                                   size_t dim_size,
                                                                   bool unbiased,
                                                                   bool keepdim);
// Query the scratch workspace size (in bytes) required by infiniopVarMean.
__INFINI_C __export infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size);
// Run the fused variance+mean reduction described by `desc` on `stream`.
// `workspace` must be at least `workspace_size` bytes; the trailing reduction
// arguments must match the values used at descriptor creation.
__INFINI_C __export infiniStatus_t infiniopVarMean(infiniopVarMeanDescriptor_t desc,
                                                   void *workspace,
                                                   size_t workspace_size,
                                                   void *var_output,
                                                   void *mean_output,
                                                   const void *input,
                                                   size_t *dim,
                                                   size_t dim_size,
                                                   bool unbiased,
                                                   bool keepdim,
                                                   void *stream);
// Destroy a descriptor created by infiniopCreateVarMeanDescriptor.
__INFINI_C __export infiniStatus_t infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc);
#endif
...@@ -49,10 +49,7 @@ from infinicore.dtype import ( ...@@ -49,10 +49,7 @@ from infinicore.dtype import (
) )
from infinicore.ops.add import add from infinicore.ops.add import add
from infinicore.ops.add_rms_norm import add_rms_norm from infinicore.ops.add_rms_norm import add_rms_norm
from infinicore.ops.all import all
from infinicore.ops.attention import attention from infinicore.ops.attention import attention
from infinicore.ops.cross_entropy import cross_entropy
from infinicore.ops.equal import equal
from infinicore.ops.kv_caching import kv_caching from infinicore.ops.kv_caching import kv_caching
from infinicore.ops.matmul import matmul from infinicore.ops.matmul import matmul
from infinicore.ops.mha_kvcache import mha_kvcache from infinicore.ops.mha_kvcache import mha_kvcache
...@@ -64,11 +61,7 @@ from infinicore.ops.paged_attention_prefill import paged_attention_prefill ...@@ -64,11 +61,7 @@ from infinicore.ops.paged_attention_prefill import paged_attention_prefill
from infinicore.ops.paged_caching import paged_caching from infinicore.ops.paged_caching import paged_caching
from infinicore.ops.rearrange import rearrange from infinicore.ops.rearrange import rearrange
from infinicore.ops.squeeze import squeeze from infinicore.ops.squeeze import squeeze
from infinicore.ops.sum import sum
from infinicore.ops.topk import topk
from infinicore.ops.unsqueeze import unsqueeze from infinicore.ops.unsqueeze import unsqueeze
from infinicore.ops.var import var
from infinicore.ops.var_mean import var_mean
from infinicore.tensor import ( from infinicore.tensor import (
Tensor, Tensor,
empty, empty,
...@@ -127,22 +120,16 @@ __all__ = [ ...@@ -127,22 +120,16 @@ __all__ = [
"uint8", "uint8",
# Operators. # Operators.
"add", "add",
"addcmul",
"add_rms_norm", "add_rms_norm",
"add_rms_norm_", "add_rms_norm_",
"atanh",
"attention", "attention",
"binary_cross_entropy_with_logits",
"cdist",
"kv_caching", "kv_caching",
"matmul", "matmul",
"equal",
"mul", "mul",
"narrow", "narrow",
"squeeze", "squeeze",
"unsqueeze", "unsqueeze",
"rearrange", "rearrange",
"cross_entropy",
"empty", "empty",
"empty_like", "empty_like",
"from_blob", "from_blob",
...@@ -155,15 +142,9 @@ __all__ = [ ...@@ -155,15 +142,9 @@ __all__ = [
"paged_attention", "paged_attention",
"paged_attention_prefill", "paged_attention_prefill",
"ones", "ones",
"reciprocal",
"strided_empty", "strided_empty",
"strided_from_blob", "strided_from_blob",
"zeros", "zeros",
"sum",
"var_mean",
"var",
"topk",
"all",
] ]
use_ntops = False use_ntops = False
......
from .avg_pool1d import avg_pool1d
from .causal_softmax import causal_softmax from .causal_softmax import causal_softmax
from .embedding import embedding from .embedding import embedding
from .flash_attention import flash_attention from .flash_attention import flash_attention
from .hardswish import hardswish
from .hardtanh import hardtanh
from .linear import linear from .linear import linear
from .linear_w8a8i8 import linear_w8a8i8 from .linear_w8a8i8 import linear_w8a8i8
from .random_sample import random_sample from .random_sample import random_sample
...@@ -23,9 +20,6 @@ __all__ = [ ...@@ -23,9 +20,6 @@ __all__ = [
"RopeAlgo", "RopeAlgo",
"rope", "rope",
"silu", "silu",
"hardswish",
"hardtanh",
"avg_pool1d",
"swiglu", "swiglu",
"linear_w8a8i8", "linear_w8a8i8",
"silu_and_mul", "silu_and_mul",
......
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def avg_pool1d(
    input: Tensor,
    kernel_size: int,
    stride: int | None = None,
    padding: int = 0,
    *,
    out=None,
) -> Tensor:
    """Apply 1D average pooling over ``input``.

    ``stride=None`` is forwarded to the backend as 0, which the C++
    AvgPool1d implementation treats as "use kernel_size". When ``out`` is
    given, the result is written into it and ``out`` is returned.
    """
    effective_stride = 0 if stride is None else stride
    if out is not None:
        _infinicore.avg_pool1d_(
            out._underlying, input._underlying, kernel_size, effective_stride, padding
        )
        return out
    raw = _infinicore.avg_pool1d(
        input._underlying, kernel_size, effective_stride, padding
    )
    return Tensor(raw)
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def hardswish(input: Tensor, inplace: bool = False, *, out=None) -> Tensor:
    r"""Apply the Hardswish activation function element-wise.

    Prefers the ntops fast path on CUDA/MUSA devices when enabled and no
    explicit ``out`` tensor is requested; otherwise dispatches to the
    _infinicore binding (in place, into ``out``, or into a fresh Tensor).
    """
    ntops_eligible = (
        infinicore.use_ntops
        and out is None
        and input.device.type in ("cuda", "musa")
        and hasattr(infinicore.ntops.torch, "hardswish")
    )
    if ntops_eligible:
        try:
            return infinicore.ntops.torch.hardswish(input, inplace=inplace)
        except AttributeError:
            # Fall through to the _infinicore implementation.
            pass
    if inplace:
        _infinicore.hardswish_(input._underlying, input._underlying)
        return input
    if out is not None:
        _infinicore.hardswish_(out._underlying, input._underlying)
        return out
    return Tensor(_infinicore.hardswish(input._underlying))
import infinicore
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def hardtanh(
    input: Tensor,
    min_val: float = -1.0,
    max_val: float = 1.0,
    inplace: bool = False,
    *,
    out=None,
) -> Tensor:
    """Clamp every element of ``input`` into the interval [min_val, max_val]."""
    if min_val > max_val:
        raise ValueError("min_val must be less than or equal to max_val")

    lo = float(min_val)
    hi = float(max_val)

    ntops_eligible = (
        infinicore.use_ntops
        and out is None
        and input.device.type in ("cuda", "musa")
        and hasattr(infinicore.ntops.torch, "hardtanh")
    )
    if ntops_eligible:
        try:
            return infinicore.ntops.torch.hardtanh(
                input, min_val=min_val, max_val=max_val, inplace=inplace
            )
        except AttributeError:
            # Fall through to the _infinicore implementation.
            pass

    if inplace:
        _infinicore.hardtanh_(input._underlying, input._underlying, lo, hi)
        return input
    if out is not None:
        _infinicore.hardtanh_(out._underlying, input._underlying, lo, hi)
        return out
    return Tensor(_infinicore.hardtanh(input._underlying, lo, hi))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def all(input, dim=None, keepdim=False, out=None):
    """Test whether all elements (optionally along ``dim``) evaluate to True.

    Writes into ``out`` when provided, otherwise returns a new Tensor.
    """
    if out is not None:
        _infinicore.all_(out._underlying, input._underlying, dim, keepdim)
        return out
    return Tensor(_infinicore.all(input._underlying, dim, keepdim))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def cross_entropy(
    logits,
    target,
    weight=None,
    *,
    ignore_index=None,
    reduction="none",
    out=None,
):
    """Token-wise cross entropy without reduction.

    The result has the same shape as ``target`` and the dtype of ``logits``.
    Only the unreduced form is implemented; ``weight`` and ``ignore_index``
    are reserved for future use and must be left at their defaults.
    """
    # Reject the options the backend does not implement yet.
    if weight is not None:
        raise NotImplementedError("class weights are not supported yet.")
    if ignore_index is not None:
        raise NotImplementedError("ignore_index is not supported yet.")
    if reduction not in (None, "none"):
        raise NotImplementedError("Only reduction='none' is implemented.")

    if out is not None:
        _infinicore.cross_entropy_(
            out._underlying,
            logits._underlying,
            target._underlying,
        )
        return out
    return Tensor(_infinicore.cross_entropy(logits._underlying, target._underlying))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def equal(input, other, *, out=None):
    """Element-wise equality comparison of ``input`` and ``other``.

    Writes into ``out`` when provided, otherwise returns a new Tensor.
    """
    if out is not None:
        _infinicore.equal_(out._underlying, input._underlying, other._underlying)
        return out
    return Tensor(_infinicore.equal(input._underlying, other._underlying))
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def sum(input, dim=None, keepdim=False, out=None):
    """
    Sum the elements of the input tensor along the given dimensions.
    Args:
        input (Tensor): The input tensor.
        dim (optional): Dimension(s) to reduce over; None reduces over all
            elements.
        keepdim (bool): Keep reduced dimensions as size 1 in the output.
        out (Tensor, optional): The output tensor.
    Returns:
        Tensor: The output tensor (``out`` itself when it was provided).
    Example:
        >>> import infinicore
        >>> input = infinicore.tensor([[1, 2, 3], [4, 5, 6]])
        >>> output = infinicore.sum(input)
        >>> print(output)
        tensor([21])
    """
    if out is None:
        return Tensor(_infinicore.sum(input._underlying, dim, keepdim))
    _infinicore.sum_(out._underlying, input._underlying, dim, keepdim)
    return out
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def topk(input, k, dim, largest=True, sorted=True, out=None):
    # Return the k largest (or smallest, when largest=False) elements of
    # ``input`` along ``dim`` as a (values, indices) pair of Tensors.
    if out is None:
        values, indices = _infinicore.topk(input._underlying, k, dim, largest, sorted)
        return Tensor(values), Tensor(indices)
    # NOTE(review): this path forwards a single ``out._underlying`` while the
    # no-out path yields a (values, indices) pair -- confirm whether ``out``
    # should instead be unpacked as a tuple (compare ``var_mean`` below).
    _infinicore.topk_(out._underlying, input._underlying, k, dim, largest, sorted)
    return out
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def var(input, dim=None, unbiased=True, keepdim=False, out=None):
    """Compute the variance of ``input``, optionally along ``dim``.

    Writes into ``out`` when provided, otherwise returns a new Tensor.
    """
    if out is not None:
        _infinicore.var_(out._underlying, input._underlying, dim, unbiased, keepdim)
        return out
    raw = _infinicore.var(input._underlying, dim, unbiased, keepdim)
    return Tensor(raw)
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def var_mean(input, dim=None, unbiased=True, keepdim=False, out=None):
    """Compute variance and mean of ``input`` in one call.

    Returns a ``(var, mean)`` pair of Tensors, or fills and returns ``out``
    when it is provided as a ``(var_tensor, mean_tensor)`` pair.
    """
    if out is not None:
        var_out, mean_out = out
        _infinicore.var_mean_(
            var_out._underlying,
            mean_out._underlying,
            input._underlying,
            dim,
            unbiased,
            keepdim,
        )
        return out
    raw_var, raw_mean = _infinicore.var_mean(
        input._underlying, dim, unbiased, keepdim
    )
    return Tensor(raw_var), Tensor(raw_mean)
import ml_dtypes
import numpy as np import numpy as np
import torch import torch
import infinicore import infinicore
try:
import ml_dtypes
except ModuleNotFoundError:
ml_dtypes = None
def to_torch_dtype(infini_dtype): def to_torch_dtype(infini_dtype):
"""Convert infinicore data type to PyTorch data type""" """Convert infinicore data type to PyTorch data type"""
...@@ -61,9 +57,7 @@ def numpy_to_infinicore_dtype(numpy_dtype): ...@@ -61,9 +57,7 @@ def numpy_to_infinicore_dtype(numpy_dtype):
return infinicore.float64 return infinicore.float64
elif numpy_dtype == np.float16: elif numpy_dtype == np.float16:
return infinicore.float16 return infinicore.float16
elif hasattr(np, "bfloat16") and numpy_dtype == np.bfloat16: elif numpy_dtype == ml_dtypes.bfloat16:
return infinicore.bfloat16
elif ml_dtypes is not None and numpy_dtype == ml_dtypes.bfloat16:
return infinicore.bfloat16 return infinicore.bfloat16
elif numpy_dtype == np.int8: elif numpy_dtype == np.int8:
return infinicore.int8 return infinicore.int8
...@@ -92,13 +86,6 @@ def infinicore_to_numpy_dtype(infini_dtype): ...@@ -92,13 +86,6 @@ def infinicore_to_numpy_dtype(infini_dtype):
elif infini_dtype == infinicore.int16: elif infini_dtype == infinicore.int16:
return np.int16 return np.int16
elif infini_dtype == infinicore.bfloat16: elif infini_dtype == infinicore.bfloat16:
if hasattr(np, "bfloat16"):
return np.bfloat16
if ml_dtypes is None:
raise ModuleNotFoundError(
"ml_dtypes is required for bfloat16 numpy conversion. "
"Please install ml_dtypes."
)
return ml_dtypes.bfloat16 return ml_dtypes.bfloat16
elif infini_dtype == infinicore.int32: elif infini_dtype == infinicore.int32:
return np.int32 return np.int32
......
...@@ -17,12 +17,12 @@ def run_tests(args): ...@@ -17,12 +17,12 @@ def run_tests(args):
"causal_softmax.py", "causal_softmax.py",
"clip.py", "clip.py",
"conv.py", "conv.py",
# "dequantize_awq.py", #"dequantize_awq.py",
"gelu.py", "gelu.py",
"gemm.py", "gemm.py",
# "layer_norm.py", #"layer_norm.py",
"logsoftmax.py", "logsoftmax.py",
# "lp_norm.py", #"lp_norm.py",
"mul.py", "mul.py",
"ones.py", "ones.py",
"random_sample.py", "random_sample.py",
...@@ -31,7 +31,7 @@ def run_tests(args): ...@@ -31,7 +31,7 @@ def run_tests(args):
"rms_norm.py", "rms_norm.py",
"rope.py", "rope.py",
"sigmoid.py", "sigmoid.py",
# "softmax.py", #"softmax.py",
"softplus.py", "softplus.py",
"sub.py", "sub.py",
"swiglu.py", "swiglu.py",
...@@ -42,7 +42,6 @@ def run_tests(args): ...@@ -42,7 +42,6 @@ def run_tests(args):
# "paged_attention.py", # "paged_attention.py",
# "paged_caching.py", # "paged_caching.py",
# "paged_attention_prefill.py" # "paged_attention_prefill.py"
"cross_entropy.py",
]: ]:
result = subprocess.run( result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/all.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::all_impl::infiniop {

// Per-thread cache of infiniop "All" descriptors keyed by a hash of the call
// signature; evicted descriptors are destroyed by the callback below.
thread_local common::OpCache<size_t, infiniopAllDescriptor_t> caches(
    100, // capacity
    [](infiniopAllDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAllDescriptor(desc));
            desc = nullptr;
        }
    });

// Execute the infiniop "all" reduction of `input` over axes `dim` into `output`.
void calculate(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
    // Fold the actual axis values into the cache key, not just dim.size():
    // reducing different axes of the same count (e.g. {0} vs {1}) must not
    // reuse the same cached descriptor.
    size_t seed = hash_combine(output, input, dim.size(), keepdim);
    for (size_t axis : dim) {
        seed = hash_combine(seed, axis);
    }

    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    auto desc_opt = cache.get(seed);
    infiniopAllDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateAllDescriptor(
            context::getInfiniopHandle(output->device()), &desc,
            output->desc(), input->desc(), dim.data(), dim.size(), keepdim));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAllWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopAll(
        desc, workspace->data(), workspace_size,
        output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream()));
}

// Register this implementation for every device type backed by infiniop here.
static bool registered = []() {
    All::dispatcher().registerDevice({Device::Type::CPU,
                                      Device::Type::NVIDIA,
                                      Device::Type::METAX,
                                      Device::Type::MOORE,
                                      Device::Type::ILUVATAR},
                                     &calculate, false);
    return true;
}();

} // namespace infinicore::op::all_impl::infiniop
#include "infinicore/ops/all.hpp"
#include "../../utils.hpp"
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <vector>
namespace infinicore::op {

common::OpDispatcher<All::schema> &All::dispatcher() {
    static common::OpDispatcher<All::schema> dispatcher_;
    return dispatcher_;
}

// Dispatch the "all" reduction to the implementation registered for the
// input tensor's device type.
void All::execute(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No All implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
    }
    func(output, input, dim, keepdim);
}

// Out-of-place "all": derives the output shape implied by `dim`/`keepdim`,
// allocates a BOOL tensor and forwards to all_().
Tensor all(Tensor input, std::vector<size_t> dim, bool keepdim) {
    const auto in_shape = input->shape();

    // An empty dim list means "reduce over every axis".
    if (dim.empty()) {
        for (size_t i = 0; i < in_shape.size(); i++) {
            dim.push_back(i);
        }
    }
    // Sort AND deduplicate: the merge walk below assumes strictly increasing
    // axes, so a duplicated axis would desynchronize it and produce a wrong
    // output shape.
    std::sort(dim.begin(), dim.end());
    dim.erase(std::unique(dim.begin(), dim.end()), dim.end());

    std::vector<size_t> out_shape;
    if (dim.size() == in_shape.size() && !keepdim) {
        out_shape = {}; // full reduction collapses to a scalar
    } else {
        size_t j = 0; // cursor into the sorted, unique `dim`
        for (size_t i = 0; i < in_shape.size(); i++) {
            const bool reduced = (j < dim.size() && dim[j] == i);
            if (reduced) {
                j++;
                if (keepdim) {
                    out_shape.push_back(1);
                }
            } else {
                out_shape.push_back(in_shape[i]);
            }
        }
    }

    auto output = Tensor::empty(out_shape, DataType::BOOL, input->device());
    all_(output, input, dim, keepdim);
    return output;
}

// In-place variant: writes the reduction result into `output`.
void all_(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
    All::execute(output, input, dim, keepdim);
}

} // namespace infinicore::op
#include "infinicore/ops/avg_pool1d.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {

common::OpDispatcher<AvgPool1d::schema> &AvgPool1d::dispatcher() {
    static common::OpDispatcher<AvgPool1d::schema> dispatcher_;
    return dispatcher_;
}

// Dispatch a 1D average pooling to the implementation registered for the
// output tensor's device type. A stride of 0 is the "default" sentinel and
// is replaced by kernel_size.
void AvgPool1d::execute(
    Tensor output,
    Tensor input,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
    if (stride == 0) {
        stride = kernel_size;
    }
    infinicore::context::setDevice(output->device());
    auto dev_type = output->device().getType();
    auto impl = dispatcher().lookup(dev_type);
    if (impl == nullptr) {
        throw std::runtime_error(
            "No AvgPool1d implementation for device type: " + std::to_string(static_cast<int>(dev_type)));
    }
    impl(output, input, kernel_size, stride, padding);
}

// Out-of-place 1D average pooling over a [N, C, L] tensor: computes the
// output length, allocates the result and forwards to avg_pool1d_().
Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
    const size_t effective_stride = (stride == 0) ? kernel_size : stride;

    const auto &in_shape = input->shape();
    if (in_shape.size() != 3) {
        throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
    }

    const size_t padded_len = in_shape[2] + 2 * padding;
    if (padded_len < kernel_size) {
        throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
    }
    // Standard pooling output-length formula.
    const size_t out_width = (padded_len - kernel_size) / effective_stride + 1;

    Shape out_shape = {in_shape[0], in_shape[1], out_width};
    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
    avg_pool1d_(output, input, kernel_size, effective_stride, padding);
    return output;
}

// In-place variant: writes the pooling result into `output`.
void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
    AvgPool1d::execute(output, input, kernel_size, stride, padding);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/avg_pool1d.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::avg_pool1d_impl::infiniop {

// Per-thread cache of infiniop AvgPool1d descriptors keyed by a hash of the
// call signature; evicted descriptors are destroyed by the callback below.
thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
    100,
    [](infiniopAvgPool1dDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
            desc = nullptr;
        }
    });

// Run the infiniop AvgPool1d kernel for the current device and stream.
void calculate(
    Tensor output,
    Tensor input,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    // 0 is the "default stride" sentinel meaning stride == kernel_size.
    if (stride == 0) {
        stride = kernel_size;
    }

    const size_t cache_key = hash_combine(output, input, kernel_size, stride, padding);
    auto device = context::getDevice();
    auto &cache = caches.getCache(device);

    infiniopAvgPool1dDescriptor_t desc = nullptr;
    auto cached = cache.get(cache_key);
    if (cached) {
        desc = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
            context::getInfiniopHandle(device),
            &desc,
            output->desc(),
            input->desc(),
            kernel_size,
            stride,
            padding));
        cache.put(cache_key, desc);
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
        desc,
        workspace->data(),
        workspace_size,
        output->data(),
        input->data(),
        context::getStream()));
}

// Register this implementation for all device types at load time.
static bool registered = []() {
    AvgPool1d::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::avg_pool1d_impl::infiniop
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment