jerrrrry / infinicore · Commits

Commit 93191613 (unverified)
Authored Mar 13, 2026 by thatPepe; committed by GitHub, Mar 13, 2026

Merge pull request #1075 from InfiniTensor/RevertT_1-1-4

    Revert T1-1-4

Parents: 6ab911c3, def22a08
Showing 20 changed files with 0 additions and 1291 deletions (+0 −1291)
src/infinicore/pybind11/ops/equal.hpp         +0 −26
src/infinicore/pybind11/ops/hardswish.hpp     +0 −24
src/infinicore/pybind11/ops/hardtanh.hpp      +0 −28
src/infinicore/pybind11/ops/sum.hpp           +0 −60
src/infinicore/pybind11/ops/topk.hpp          +0 −54
src/infinicore/pybind11/ops/var.hpp           +0 −62
src/infinicore/pybind11/ops/var_mean.hpp      +0 −63
src/infiniop/ops/all/all_desc.h               +0 −53
src/infiniop/ops/all/cpu/all_cpu.cc           +0 −77
src/infiniop/ops/all/cpu/all_cpu.h            +0 −8
src/infiniop/ops/all/cuda/kernel.cuh          +0 −98
src/infiniop/ops/all/info.h                   +0 −66
src/infiniop/ops/all/metax/all_metax.h        +0 −8
src/infiniop/ops/all/metax/all_metax.maca     +0 −117
src/infiniop/ops/all/moore/all_moore.h        +0 −8
src/infiniop/ops/all/moore/all_moore.mu       +0 −117
src/infiniop/ops/all/nvidia/all_nvidia.cu     +0 −117
src/infiniop/ops/all/nvidia/all_nvidia.cuh    +0 −8
src/infiniop/ops/all/operator.cc              +0 −194
src/infiniop/ops/avg_pool1d/avg_pool1d.h      +0 −103
src/infinicore/pybind11/ops/equal.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/equal.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_equal(py::module &m) {
    m.def("equal",
          &op::equal,
          py::arg("a"),
          py::arg("b"),
          R"doc(Elementwise equality returning a bool tensor.)doc");

    m.def("equal_",
          &op::equal_,
          py::arg("out"),
          py::arg("a"),
          py::arg("b"),
          R"doc(In-place elementwise equality writing into `out`.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/hardswish.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/hardswish.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_hardswish(py::module &m) {
    m.def("hardswish",
          &op::hardswish,
          py::arg("input"),
          R"doc(Out-of-place Hardswish activation.)doc");

    m.def("hardswish_",
          &op::hardswish_,
          py::arg("output"),
          py::arg("input"),
          R"doc(In-place Hardswish activation.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/hardtanh.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/hardtanh.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_hardtanh(py::module &m) {
    m.def("hardtanh",
          &op::hardtanh,
          py::arg("input"),
          py::arg("min_val") = -1.0f,
          py::arg("max_val") = 1.0f,
          R"doc(Apply the HardTanh activation.)doc");

    m.def("hardtanh_",
          &op::hardtanh_,
          py::arg("output"),
          py::arg("input"),
          py::arg("min_val") = -1.0f,
          py::arg("max_val") = 1.0f,
          R"doc(In-place HardTanh activation.)doc");
}

} // namespace infinicore::ops
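For reference, the `py::arg(...) = value` pattern this binding used is standard pybind11: a defaulted keyword argument on the C++ side becomes an optional keyword argument in Python. A minimal, self-contained sketch of the idea (the module name `demo` and the `clamp_value` function are illustrative stand-ins, not part of infinicore):

#include <algorithm>

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Hypothetical scalar stand-in for an activation like hardtanh.
float clamp_value(float x, float min_val, float max_val) {
    return std::min(std::max(x, min_val), max_val);
}

PYBIND11_MODULE(demo, m) {
    // As in the binding above: demo.clamp(0.5) uses the ±1.0 defaults,
    // demo.clamp(0.5, min_val=-2.0) overrides one of them by keyword.
    m.def("clamp", &clamp_value,
          py::arg("x"), py::arg("min_val") = -1.0f, py::arg("max_val") = 1.0f);
}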
src/infinicore/pybind11/ops/sum.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/sum.hpp"

namespace py = pybind11;

namespace infinicore::ops {

Tensor py_sum(Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        // dim=None means reduce over every dimension.
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return op::sum(input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::sum(input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::sum(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        op::sum_(output, input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::sum_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::sum_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

inline void bind_sum(py::module &m) {
    m.def("sum",
          &py_sum,
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(Sum of the input tensor along the given dimensions.)doc");

    m.def("sum_",
          &py_sum_,
          py::arg("output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(In-place tensor sum.)doc");
}

} // namespace infinicore::ops
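One thing the `py::int_` branch above does not do is translate negative dims: `dim.cast<size_t>()` would wrap a Python `-1` into a huge index. If negative indexing were wanted, a helper along these lines could normalize first (a sketch under that assumption, not part of the original file):

#include <cstddef>
#include <stdexcept>

// Hypothetical helper: map a possibly negative Python-style dim into
// [0, ndim), throwing on out-of-range values. The bindings above accept
// non-negative dims only.
inline size_t normalize_dim(long long dim, size_t ndim) {
    const long long n = static_cast<long long>(ndim);
    if (dim < -n || dim >= n) {
        throw std::out_of_range("dim out of range");
    }
    return static_cast<size_t>(dim < 0 ? dim + n : dim);
}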
src/infinicore/pybind11/ops/topk.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // add this line

#include "infinicore/ops/topk.hpp"

namespace py = pybind11;

namespace infinicore::ops {

std::pair<Tensor, Tensor> py_topk(Tensor input, size_t k, int dim, bool largest, bool sorted) {
    if (dim == -1) {
        // Only -1 is accepted as a negative dim; it means the last axis.
        return op::topk(input, k, input->ndim() - 1, largest, sorted);
    } else if (dim >= 0) {
        return op::topk(input, k, static_cast<size_t>(dim), largest, sorted);
    } else {
        throw std::invalid_argument("invalid argument: dim");
    }
}

void py_topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, int dim, bool largest, bool sorted) {
    if (dim == -1) {
        op::topk_(values_output, indices_output, input, k, input->ndim() - 1, largest, sorted);
    } else if (dim >= 0) {
        op::topk_(values_output, indices_output, input, k, static_cast<size_t>(dim), largest, sorted);
    } else {
        throw std::invalid_argument("invalid argument: dim");
    }
}

inline void bind_topk(py::module &m) {
    m.def("topk",
          &py_topk,
          py::arg("input"),
          py::arg("k"),
          py::arg("dim"),
          py::arg("largest"),
          py::arg("sorted"),
          R"doc(Top-k of the input tensor along the given dimension.)doc");

    m.def("topk_",
          &py_topk_,
          py::arg("values_output"),
          py::arg("indices_output"),
          py::arg("input"),
          py::arg("k"),
          py::arg("dim"),
          py::arg("largest"),
          py::arg("sorted"),
          R"doc(In-place tensor top-k.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/var.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/var.hpp"

namespace py = pybind11;

namespace infinicore::ops {

Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return op::var(input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::var(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::var(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

void py_var_(Tensor var_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        op::var_(var_output, input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::var_(var_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::var_(var_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

inline void bind_var(py::module &m) {
    m.def("var",
          &py_var,
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(Variance of the input tensor along the given dimensions.)doc");

    m.def("var_",
          &py_var_,
          py::arg("var_output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(In-place tensor variance.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/var_mean.hpp deleted 100644 → 0

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/var_mean.hpp"

namespace py = pybind11;

namespace infinicore::ops {

std::pair<Tensor, Tensor> py_var_mean(Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return op::var_mean(input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::var_mean(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::var_mean(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (int i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        op::var_mean_(var_output, mean_output, input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::var_mean_(var_output, mean_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::var_mean_(var_output, mean_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be a list/tuple or an integer");
    }
}

inline void bind_var_mean(py::module &m) {
    m.def("var_mean",
          &py_var_mean,
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(Variance and mean of the input tensor along the given dimensions.)doc");

    m.def("var_mean_",
          &py_var_mean_,
          py::arg("var_output"),
          py::arg("mean_output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(In-place tensor variance and mean.)doc");
}

} // namespace infinicore::ops
src/infiniop/ops/all/all_desc.h deleted 100644 → 0

#ifndef INFINIOP_ALL_DESCRIPTOR_H_
#define INFINIOP_ALL_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::all::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AllInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AllInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
size_t *dim, \
size_t dim_size, \
bool keepdim, \
void *stream) const; \
}; \
}
#endif
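The macro stamps out an identically shaped Descriptor class inside a per-backend namespace, so each device file only supplies the constructor, destructor, and calculate bodies. A trimmed, compilable illustration of the pattern (the names `demo::*` and `DEMO_DESCRIPTOR` are invented for this sketch; the real interface is the one above):

#include <cstddef>

#define DEMO_DESCRIPTOR(NAMESPACE)                               \
    namespace demo::NAMESPACE {                                  \
    class Descriptor {                                           \
        size_t _workspace_size = 0;                              \
                                                                 \
    public:                                                      \
        size_t workspaceSize() const { return _workspace_size; } \
    };                                                           \
    }

DEMO_DESCRIPTOR(cpu)
DEMO_DESCRIPTOR(nvidia)

int main() {
    demo::cpu::Descriptor cpu_desc;    // same shape, different namespace
    demo::nvidia::Descriptor gpu_desc;
    return (cpu_desc.workspaceSize() + gpu_desc.workspaceSize()) == 0 ? 0 : 1;
}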
src/infiniop/ops/all/cpu/all_cpu.cc deleted 100644 → 0

#include "all_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"

#include <iostream>

namespace op::all::cpu {

Descriptor::~Descriptor() {}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(
        nullptr,
        result.take(),
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

template <typename Tdata>
infiniStatus_t calculateAll(
    const AllInfo &info,
    bool *output,
    const Tdata *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    if (info.reduce_dim_size == info.ndim) {
        // Reducing over every dimension: one boolean AND across all elements.
        bool result = true;
        for (size_t index = 0; index < info.input_size; index++) {
            size_t input_offset = op::common_cpu::indexToOffset(
                index, info.ndim,
                info.permuted_input_shape.data(),
                info.permuted_input_strides.data());
            result = result && input[input_offset];
        }
        output[0] = result;
        return INFINI_STATUS_SUCCESS;
    } else {
        for (size_t i = info.output_size; i-- > 0;) {
            size_t output_offset = op::common_cpu::indexToOffset(
                i, info.output_shape.size(),
                info.output_shape.data(),
                info.output_strides.data());
            bool result = true;
            for (size_t j = 0; j < info.reduce_num; j++) {
                size_t input_flat = j + i * info.reduce_num;
                size_t input_offset = op::common_cpu::indexToOffset(
                    input_flat, info.ndim,
                    info.permuted_input_shape.data(),
                    info.permuted_input_strides.data());
                Tdata input_val = input[input_offset];
                bool bool_val = static_cast<bool>(input_val);
                result = result && bool_val;
            }
            output[output_offset] = result;
        }
        return INFINI_STATUS_SUCCESS;
    }
}

} // namespace

infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_BOOL:
        return calculateAll<bool>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const bool *>(input), dim, dim_size, keepdim);
    case INFINI_DTYPE_U8:
        return calculateAll<uint8_t>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const uint8_t *>(input), dim, dim_size, keepdim);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::all::cpu
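Both branches lean on `op::common_cpu::indexToOffset` to walk strided, possibly permuted memory. That helper is assumed to behave like the device-side version defined in cuda/kernel.cuh below: peel the flat index from the innermost dimension outward and accumulate stride offsets. A standalone reference version for orientation:

#include <cstddef>

// Assumed semantics of op::common_cpu::indexToOffset (matching the device
// helper in cuda/kernel.cuh): decompose a row-major flat index into
// per-dimension coordinates and dot them with the strides.
size_t indexToOffset(size_t flat_index, size_t ndim,
                     const size_t *shape, const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}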
src/infiniop/ops/all/cpu/all_cpu.h deleted 100644 → 0

#ifndef __INFINIOP_ALL_CPU_H__
#define __INFINIOP_ALL_CPU_H__

#include "../all_desc.h"

DESCRIPTOR(cpu);

#endif // __INFINIOP_ALL_CPU_H__
src/infiniop/ops/all/cuda/kernel.cuh deleted 100644 → 0

#ifndef __ALL_CUDA_H__
#define __ALL_CUDA_H__

__forceinline__ __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allReduceTempKernel(
    bool *temp_output,
    const Tdata *input,
    size_t input_size,
    size_t permuted_input_shape_size,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx < input_size) {
        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        s_data[tid] = static_cast<bool>(input[input_offset]);
    } else {
        s_data[tid] = true;
    }
    __syncthreads();
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        temp_output[blockIdx.x] = s_data[0];
    }
}

template <size_t BLOCK_SIZE>
__global__ void finalAllReduceKernel(
    bool *output,
    const bool *block_results,
    size_t num_blocks) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    bool thread_val = true;
    for (size_t i = tid; i < num_blocks; i += blockDim.x) {
        thread_val = thread_val && block_results[i];
    }
    s_data[tid] = thread_val;
    __syncthreads();
    for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *output = s_data[0];
    }
}

template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allKernel(
    bool *output,
    const Tdata *input,
    size_t permuted_input_shape_size,
    size_t output_shape_size,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    size_t *output_shape,
    ptrdiff_t *permuted_input_strides,
    ptrdiff_t *output_strides) {
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx >= output_size) {
        return;
    }
    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
    bool tempRes = true;
    for (size_t i = 0; i < reduce_num; i++) {
        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        tempRes = tempRes && static_cast<bool>(input[input_offset]);
    }
    output[output_index] = tempRes;
}

#endif // __ALL_CUDA_H__
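The full-reduction path here is a classic two-pass tree reduction: `allReduceTempKernel` ANDs each block's slice into one boolean per block, padding out-of-range lanes with `true` (the identity of logical AND), and `finalAllReduceKernel` ANDs the per-block results. A host-side reference with the same structure, useful as a test oracle (the block size is an arbitrary parameter here, not the kernels' launch configuration):

#include <algorithm>
#include <cstddef>
#include <vector>

// Host mirror of the two-pass AND-reduction: pass 1 produces one bool per
// "block", pass 2 combines the block results. Out-of-range lanes contribute
// true, exactly as in the kernel's shared-memory padding.
bool allReduceReference(const std::vector<bool> &input, size_t block_size) {
    const size_t num_blocks = (input.size() + block_size - 1) / block_size;
    std::vector<bool> block_results(num_blocks, true);
    for (size_t b = 0; b < num_blocks; ++b) { // pass 1: per-block AND
        const size_t end = std::min(input.size(), (b + 1) * block_size);
        for (size_t i = b * block_size; i < end; ++i) {
            block_results[b] = block_results[b] && input[i];
        }
    }
    bool result = true; // pass 2: AND across block results
    for (size_t b = 0; b < num_blocks; ++b) {
        result = result && block_results[b];
    }
    return result;
}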
src/infiniop/ops/all/info.h deleted 100644 → 0

#ifndef __ALL_INFO_H__
#define __ALL_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"

#include <algorithm>
#include <cstddef>
#include <vector>

namespace op::all {

class AllInfo {
    AllInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> permuted_input_shape;      // need to permute
    std::vector<size_t> output_shape;
    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
    std::vector<ptrdiff_t> output_strides;
    size_t reduce_dim_size; // reduce dim size
    size_t reduce_num;      // number of elements to reduce for each output element
    size_t input_size;      // total number of input elements
    size_t output_size;     // total number of output elements
    size_t ndim;            // number of dimensions

    static utils::Result<AllInfo> create(
        infiniopTensorDescriptor_t output_desc,
        infiniopTensorDescriptor_t input_desc,
        size_t *dim,
        size_t dim_size,
        bool keepdim) {
        auto input_shape = input_desc->shape();
        auto input_strides = input_desc->strides();
        size_t input_ndim = input_desc->ndim();

        size_t reduce_num = 1;
        for (size_t i = 0; i < dim_size; i++) {
            reduce_num *= input_shape[dim[i]];
        }

        std::vector<size_t> permute_order;
        for (size_t i = 0; i < input_ndim; i++) {
            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
                permute_order.push_back(i);
            }
        }
        for (size_t i = 0; i < dim_size; i++) {
            permute_order.push_back(dim[i]);
        }

        std::vector<size_t> permuted_input_shape;
        std::vector<ptrdiff_t> permuted_input_strides;
        for (size_t i = 0; i < permute_order.size(); i++) {
            permuted_input_shape.push_back(input_shape[permute_order[i]]);
            permuted_input_strides.push_back(input_strides[permute_order[i]]);
        }

        return utils::Result<AllInfo>(AllInfo{
            input_desc->dtype(),
            permuted_input_shape,
            output_desc->shape(),
            permuted_input_strides,
            output_desc->strides(),
            dim_size,
            reduce_num,
            input_desc->numel(),
            output_desc->numel(),
            input_ndim});
    }
};

} // namespace op::all

#endif
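`create` moves the reduced axes to the back of the permutation, so that after permuting shape and strides, the elements feeding output element `i` are exactly the flat indices `i * reduce_num + j` for `j` in `[0, reduce_num)`; this is the contract the CPU and device kernels rely on. A worked check of the ordering logic, standalone and with made-up shapes:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    // 3-D input, reducing axis 1: non-reduced axes {0, 2} come first,
    // the reduced axis goes last.
    size_t dim[] = {1};
    const size_t dim_size = 1, input_ndim = 3;

    std::vector<size_t> permute_order;
    for (size_t i = 0; i < input_ndim; i++) {
        if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
            permute_order.push_back(i);
        }
    }
    for (size_t i = 0; i < dim_size; i++) {
        permute_order.push_back(dim[i]);
    }

    assert((permute_order == std::vector<size_t>{0, 2, 1}));
    return 0;
}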
src/infiniop/ops/all/metax/all_metax.h deleted 100644 → 0

#ifndef __ALL_METAX_H__
#define __ALL_METAX_H__

#include "../all_desc.h"

DESCRIPTOR(metax);

#endif
src/infiniop/ops/all/metax/all_metax.maca deleted 100644 → 0

#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_metax.h"
namespace op::all::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
hcStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
CHECK_METAX(hcFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::metax
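`create` sizes the workspace and `launchKernel` carves it up using the same convention, here and in the Moore and NVIDIA variants below: the two shape arrays as `size_t`, then the two stride arrays as `ptrdiff_t`, packed into one device allocation. The sizing rule in isolation:

#include <cstddef>

// Workspace layout shared by the device backends:
// [permuted input shape | output shape]     -> (input_ndim + output_ndim) size_t
// [permuted input strides | output strides] -> (input_ndim + output_ndim) ptrdiff_t
size_t allWorkspaceSize(size_t input_ndim, size_t output_ndim) {
    return (input_ndim + output_ndim) * (sizeof(size_t) + sizeof(ptrdiff_t));
}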
src/infiniop/ops/all/moore/all_moore.h deleted 100644 → 0

#ifndef __ALL_MOORE_H__
#define __ALL_MOORE_H__

#include "../all_desc.h"

DESCRIPTOR(moore);

#endif
src/infiniop/ops/all/moore/all_moore.mu deleted 100644 → 0

#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_moore.h"
namespace op::all::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_MOORE(musaMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
CHECK_MOORE(musaFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::moore
src/infiniop/ops/all/nvidia/all_nvidia.cu deleted 100644 → 0

#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "all_nvidia.cuh"

namespace op::all::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
    const AllInfo &info,
    bool *output, const Tdata *input,
    cudaStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    size_t output_ndim = info.output_shape.size();
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;
    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
    size_t workspace_offset = 0;
    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
    size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
    ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
    if (info.reduce_num == input_size) {
        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        bool *temp_output;
        CHECK_CUDA(cudaMalloc(&temp_output, grid_size * sizeof(bool)));
        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
            temp_output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE>>>(output, temp_output, grid_size);
        CHECK_CUDA(cudaFree(temp_output));
    } else {
        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            output, input, input_ndim, output_ndim, output_size, reduce_num,
            permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream_) const {
    cudaStream_t stream = (cudaStream_t)stream_;

#define CALCULATE_ALL(BLOCK_SIZE, Tdata)      \
    launchKernel<BLOCK_SIZE, Tdata>(          \
        _info,                                \
        (bool *)output, (const Tdata *)input, \
        stream, workspace, workspace_size)

#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE)      \
    {                                                  \
        if (_info.dtype == INFINI_DTYPE_BOOL)          \
            return CALCULATE_ALL(BLOCK_SIZE, bool);    \
        else if (_info.dtype == INFINI_DTYPE_U8)       \
            return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
        else                                           \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;     \
    }

    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::all::nvidia
src/infiniop/ops/all/nvidia/all_nvidia.cuh deleted 100644 → 0

#ifndef __ALL_NVIDIA_H__
#define __ALL_NVIDIA_H__

#include "../all_desc.h"

DESCRIPTOR(nvidia);

#endif // __ALL_NVIDIA_H__
src/infiniop/ops/all/operator.cc deleted 100644 → 0

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/all.h"

#include <vector>

#ifdef ENABLE_CPU_API
#include "cpu/all_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/all_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/all_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/all_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/all_moore.h"
#endif

__INFINI_C infiniStatus_t infiniopCreateAllDescriptor(
    infiniopHandle_t handle,
    infiniopAllDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {

#define CREATE(CASE, NAMESPACE)                                            \
    case CASE:                                                             \
        return op::all::NAMESPACE::Descriptor::create(                     \
            handle,                                                        \
            reinterpret_cast<op::all::NAMESPACE::Descriptor **>(desc_ptr), \
            output_desc,                                                   \
            input_desc,                                                    \
            dim,                                                           \
            dim_size,                                                      \
            keepdim)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__INFINI_C infiniStatus_t infiniopGetAllWorkspaceSize(
    infiniopAllDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                               \
    case CASE:                                                                             \
        *size = reinterpret_cast<op::all::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__INFINI_C infiniStatus_t infiniopAll(
    infiniopAllDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, output, input, dim, dim_size, keepdim, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__INFINI_C infiniStatus_t infiniopDestroyAllDescriptor(
    infiniopAllDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                               \
    case CASE:                                                                \
        delete reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
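All four entry points follow the same shape: a local macro stamps one `case` per enabled backend into a device switch, then is `#undef`ed. A self-contained sketch of the idiom, with invented names (`Device`, `demo::cpu`, `demo::nvidia` are illustrative stand-ins, not the real API):

#include <cstdio>

enum Device { DEVICE_CPU, DEVICE_NVIDIA };

namespace demo::cpu { inline int create() { return std::puts("cpu"); } }
namespace demo::nvidia { inline int create() { return std::puts("nvidia"); } }

// One macro emits one case per backend; the switch stays a flat jump table.
int createDescriptor(Device device) {
#define CREATE(CASE, NAMESPACE) \
    case CASE:                  \
        return demo::NAMESPACE::create()

    switch (device) {
        CREATE(DEVICE_CPU, cpu);
        CREATE(DEVICE_NVIDIA, nvidia);
    default:
        return -1; // plays the role of INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
    }
#undef CREATE
}

int main() { return createDescriptor(DEVICE_CPU) >= 0 ? 0 : 1; }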
src/infiniop/ops/avg_pool1d/avg_pool1d.h deleted 100644 → 0

#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::avg_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AvgPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
AvgPool1dInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t kernel_size, \
size_t stride, \
size_t padding); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
class AvgPool1dInfo {
private:
    AvgPool1dInfo() = default;

public:
    infiniDtype_t dtype;
    size_t batch, channels, in_width, out_width;
    size_t kernel_size, stride, padding;
    ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
    ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;

    static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        size_t kernel_size,
        size_t stride,
        size_t padding) {
        CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);
        const infiniDtype_t dtype = y_desc->dtype();
        CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
        CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t batch = x_desc->dim(0);
        size_t channels = x_desc->dim(1);
        size_t in_width = x_desc->dim(2);
        CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t padded_len = in_width + 2 * padding;
        CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t expected_out_width = (padded_len - kernel_size) / stride + 1;
        CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t out_width = expected_out_width;

        return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
            dtype,
            batch, channels, in_width, out_width,
            kernel_size, stride, padding,
            y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
            x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
    }
};

#endif
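The shape validation encodes the standard 1-D pooling formula: with padded_len = in_width + 2 * padding, the expected output width is (padded_len - kernel_size) / stride + 1. A quick standalone check with made-up sizes:

#include <cassert>
#include <cstddef>

int main() {
    // Example numbers only: in_width 10, kernel 3, stride 2, padding 1.
    const size_t in_width = 10, kernel_size = 3, stride = 2, padding = 1;
    const size_t padded_len = in_width + 2 * padding; // 12
    assert(padded_len >= kernel_size);                // mirrors CHECK_OR_RETURN
    const size_t out_width = (padded_len - kernel_size) / stride + 1;
    assert(out_width == 5);                           // (12 - 3) / 2 + 1
    return 0;
}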