Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
210e31d3
Commit
210e31d3
authored
Mar 09, 2026
by
PanZezhong
Browse files
issue/1031 T1-1-4
parent
7f295448
Changes
98
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1143 additions
and
0 deletions
+1143
-0
src/infinicore/ops/sum/sum_infiniop.cc
src/infinicore/ops/sum/sum_infiniop.cc
+57
-0
src/infinicore/ops/topk/topk.cc
src/infinicore/ops/topk/topk.cc
+40
-0
src/infinicore/ops/topk/topk_infiniop.cc
src/infinicore/ops/topk/topk_infiniop.cc
+57
-0
src/infinicore/ops/var/var.cc
src/infinicore/ops/var/var.cc
+68
-0
src/infinicore/ops/var/var_infiniop.cc
src/infinicore/ops/var/var_infiniop.cc
+57
-0
src/infinicore/ops/var_mean/var_mean.cc
src/infinicore/ops/var_mean/var_mean.cc
+69
-0
src/infinicore/ops/var_mean/var_mean_infiniop.cc
src/infinicore/ops/var_mean/var_mean_infiniop.cc
+59
-0
src/infinicore/pybind11/ops.hpp
src/infinicore/pybind11/ops.hpp
+10
-0
src/infinicore/pybind11/ops/all.hpp
src/infinicore/pybind11/ops/all.hpp
+60
-0
src/infinicore/pybind11/ops/sum.hpp
src/infinicore/pybind11/ops/sum.hpp
+60
-0
src/infinicore/pybind11/ops/topk.hpp
src/infinicore/pybind11/ops/topk.hpp
+54
-0
src/infinicore/pybind11/ops/var.hpp
src/infinicore/pybind11/ops/var.hpp
+62
-0
src/infinicore/pybind11/ops/var_mean.hpp
src/infinicore/pybind11/ops/var_mean.hpp
+63
-0
src/infiniop/ops/all/all_desc.h
src/infiniop/ops/all/all_desc.h
+53
-0
src/infiniop/ops/all/cpu/all_cpu.cc
src/infiniop/ops/all/cpu/all_cpu.cc
+77
-0
src/infiniop/ops/all/cpu/all_cpu.h
src/infiniop/ops/all/cpu/all_cpu.h
+8
-0
src/infiniop/ops/all/cuda/kernel.cuh
src/infiniop/ops/all/cuda/kernel.cuh
+98
-0
src/infiniop/ops/all/info.h
src/infiniop/ops/all/info.h
+66
-0
src/infiniop/ops/all/metax/all_metax.h
src/infiniop/ops/all/metax/all_metax.h
+8
-0
src/infiniop/ops/all/metax/all_metax.maca
src/infiniop/ops/all/metax/all_metax.maca
+117
-0
No files found.
src/infinicore/ops/sum/sum_infiniop.cc
0 → 100644
View file @
210e31d3
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/sum.hpp"
#include <infiniop.h>
namespace infinicore::op::sum_impl::infiniop {

// Thread-local cache of infiniop Sum descriptors, keyed by a hash of the call
// signature. The eviction callback destroys the underlying descriptor.
thread_local common::OpCache<size_t, infiniopSumDescriptor_t> caches(
    100, // capacity
    [](infiniopSumDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroySumDescriptor(desc));
            desc = nullptr;
        }
    });

// Reduce `input` over the axes in `dim` into `output` using the infiniop Sum
// kernel. Descriptors are cached per (device, call signature) to avoid the
// cost of rebuilding them on every call.
void calculate(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
    const size_t seed = hash_combine(output, input, dim.size(), keepdim);

    const auto device_type = context::getDevice().getType();
    const auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    infiniopSumDescriptor_t desc = nullptr;
    if (auto cached = cache.get(seed); cached) {
        desc = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateSumDescriptor(
            context::getInfiniopHandle(output->device()), &desc,
            output->desc(), input->desc(),
            dim.data(), dim.size(), keepdim));
        cache.put(seed, desc);
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetSumWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopSum(
        desc, workspace->data(), workspace_size,
        output->data(), input->data(),
        dim.data(), dim.size(), keepdim,
        context::getStream()));
}

// Self-registration: this generic implementation serves every listed backend.
static bool registered = []() {
    Sum::dispatcher().registerDevice(
        {Device::Type::CPU, Device::Type::NVIDIA, Device::Type::METAX,
         Device::Type::MOORE, Device::Type::ILUVATAR},
        &calculate, false);
    return true;
}();

} // namespace infinicore::op::sum_impl::infiniop
src/infinicore/ops/topk/topk.cc
0 → 100644
View file @
210e31d3
#include "infinicore/ops/topk.hpp"
#include "../../utils.hpp"
#include <stdexcept>
#include <vector>
namespace infinicore::op {

// Dispatcher singleton mapping device types to registered TopK kernels.
common::OpDispatcher<TopK::schema> &TopK::dispatcher() {
    static common::OpDispatcher<TopK::schema> dispatcher_;
    return dispatcher_;
};

// Dispatch a top-k computation to the implementation registered for the
// device `input` lives on. Throws std::runtime_error if none is registered.
void TopK::execute(Tensor values_output, Tensor indices_output, Tensor input,
                   size_t k, size_t dim, bool largest, bool sorted) {
    // Fix: indices_output must also live on the same device; previously only
    // values_output and input were checked.
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(values_output, indices_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No Topk implementation found for device type: "
                                 + std::to_string(static_cast<int>(device_type)));
    }
    func(values_output, indices_output, input, k, dim, largest, sorted);
}

// Allocate output tensors and return the top-k (values, indices) of `input`
// along `dim`. Indices are produced as I32.
std::pair<Tensor, Tensor> topk(Tensor input, size_t k, size_t dim,
                               bool largest, bool sorted) {
    auto in_shape = input->shape();
    std::vector<size_t> out_shape = in_shape;
    out_shape[dim] = k; // output matches input except along the reduced axis
    auto values_output = Tensor::empty(out_shape, input->dtype(), input->device());
    auto indices_output = Tensor::empty(out_shape, DataType::I32, input->device());
    topk_(values_output, indices_output, input, k, dim, largest, sorted);
    return {values_output, indices_output};
}

// In-place variant: write results into caller-provided tensors.
void topk_(Tensor values_output, Tensor indices_output, Tensor input,
           size_t k, size_t dim, bool largest, bool sorted) {
    TopK::execute(values_output, indices_output, input, k, dim, largest, sorted);
}

} // namespace infinicore::op
src/infinicore/ops/topk/topk_infiniop.cc
0 → 100644
View file @
210e31d3
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/topk.hpp"
#include <infiniop.h>
namespace infinicore::op::topk_impl::infiniop {

// Thread-local cache of infiniop TopK descriptors keyed by a hash of the
// call signature; evicted descriptors are destroyed by the callback.
thread_local common::OpCache<size_t, infiniopTopKDescriptor_t> caches(
    100, // capacity
    [](infiniopTopKDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyTopKDescriptor(desc));
            desc = nullptr;
        }
    });

// Run the infiniop TopK kernel, writing the top-k values and their indices
// along `dim` into the two output tensors.
void calculate(Tensor values_output, Tensor indices_output, Tensor input,
               size_t k, size_t dim, bool largest, bool sorted) {
    const size_t seed = hash_combine(values_output, indices_output, input,
                                     k, dim, largest, sorted);

    const auto device_type = context::getDevice().getType();
    const auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    infiniopTopKDescriptor_t desc = nullptr;
    if (auto cached = cache.get(seed); cached) {
        desc = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateTopKDescriptor(
            context::getInfiniopHandle(values_output->device()), &desc,
            values_output->desc(), indices_output->desc(), input->desc(),
            k, dim, largest, sorted));
        cache.put(seed, desc);
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetTopKWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopTopK(
        desc, workspace->data(), workspace_size,
        values_output->data(), indices_output->data(), input->data(),
        k, dim, largest, sorted,
        context::getStream()));
}

// Self-registration for every backend served by this generic implementation.
static bool registered = []() {
    TopK::dispatcher().registerDevice(
        {Device::Type::CPU, Device::Type::NVIDIA, Device::Type::METAX,
         Device::Type::MOORE, Device::Type::ILUVATAR},
        &calculate, false);
    return true;
}();

} // namespace infinicore::op::topk_impl::infiniop
src/infinicore/ops/var/var.cc
0 → 100644
View file @
210e31d3
#include "infinicore/ops/var.hpp"

#include "../../utils.hpp"

#include <algorithm> // std::sort (was used without being included)
#include <stdexcept>
#include <vector>
namespace infinicore::op {

namespace {

// Compute the output shape produced by reducing `in_shape` over the
// ascending-sorted dimension list `dim`. With keepdim the reduced axes are
// kept as size 1; otherwise they are dropped (full reduction -> 0-d shape).
// Extracted to replace the duplicated keepdim / no-keepdim loops.
std::vector<size_t> reducedShape(const std::vector<size_t> &in_shape,
                                 const std::vector<size_t> &dim,
                                 bool keepdim) {
    if (dim.size() == in_shape.size() && !keepdim) {
        return {};
    }
    std::vector<size_t> out_shape;
    size_t j = 0;
    for (size_t i = 0; i < in_shape.size(); i++) {
        if (j < dim.size() && dim[j] == i) {
            if (keepdim) {
                out_shape.push_back(1);
            }
            j++;
        } else {
            out_shape.push_back(in_shape[i]);
        }
    }
    return out_shape;
}

} // namespace

// Dispatcher singleton mapping device types to registered Var kernels.
common::OpDispatcher<Var::schema> &Var::dispatcher() {
    static common::OpDispatcher<Var::schema> dispatcher_;
    return dispatcher_;
};

// Dispatch a variance computation to the implementation registered for the
// device `input` lives on. Throws std::runtime_error if none is registered.
void Var::execute(Tensor var_output, Tensor input, std::vector<size_t> dim,
                  bool unbiased, bool keepdim) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No Var implementation found for device type: "
                                 + std::to_string(static_cast<int>(device_type)));
    }
    func(var_output, input, dim, unbiased, keepdim);
}

// Allocate the output tensor and compute the variance of `input` over `dim`.
// An empty `dim` means "reduce over every dimension".
Tensor var(Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    auto in_shape = input->shape();
    if (dim.empty()) {
        for (size_t i = 0; i < in_shape.size(); i++) {
            dim.push_back(i);
        }
    }
    std::sort(dim.begin(), dim.end()); // reducedShape expects ascending dims
    std::vector<size_t> out_shape = reducedShape(in_shape, dim, keepdim);
    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
    var_(var_output, input, dim, unbiased, keepdim);
    return var_output;
}

// In-place variant: write the variance into a caller-provided tensor.
void var_(Tensor var_output, Tensor input, std::vector<size_t> dim,
          bool unbiased, bool keepdim) {
    Var::execute(var_output, input, dim, unbiased, keepdim);
}

} // namespace infinicore::op
src/infinicore/ops/var/var_infiniop.cc
0 → 100644
View file @
210e31d3
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/var.hpp"
#include <infiniop.h>
namespace infinicore::op::var_impl::infiniop {

// Thread-local cache of infiniop Var descriptors keyed by a hash of the
// call signature; the eviction callback destroys the descriptor.
thread_local common::OpCache<size_t, infiniopVarDescriptor_t> caches(
    100, // capacity
    [](infiniopVarDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyVarDescriptor(desc));
            desc = nullptr;
        }
    });

// Run the infiniop Var kernel: variance of `input` over `dim` into
// `var_output`. Descriptors are cached per (device, call signature).
void calculate(Tensor var_output, Tensor input, std::vector<size_t> dim,
               bool unbiased, bool keepdim) {
    const size_t seed = hash_combine(var_output, input, dim.size(), unbiased, keepdim);

    const auto device_type = context::getDevice().getType();
    const auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    infiniopVarDescriptor_t desc = nullptr;
    if (auto cached = cache.get(seed); cached) {
        desc = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateVarDescriptor(
            context::getInfiniopHandle(var_output->device()), &desc,
            var_output->desc(), input->desc(),
            dim.data(), dim.size(), unbiased, keepdim));
        cache.put(seed, desc);
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetVarWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopVar(
        desc, workspace->data(), workspace_size,
        var_output->data(), input->data(),
        dim.data(), dim.size(), unbiased, keepdim,
        context::getStream()));
}

// Self-registration for every backend served by this generic implementation.
static bool registered = []() {
    Var::dispatcher().registerDevice(
        {Device::Type::CPU, Device::Type::NVIDIA, Device::Type::METAX,
         Device::Type::MOORE, Device::Type::ILUVATAR},
        &calculate, false);
    return true;
}();

} // namespace infinicore::op::var_impl::infiniop
src/infinicore/ops/var_mean/var_mean.cc
0 → 100644
View file @
210e31d3
#include "infinicore/ops/var_mean.hpp"

#include "../../utils.hpp"

#include <algorithm> // std::sort (was used without being included)
#include <stdexcept>
#include <vector>
namespace infinicore::op {

namespace {

// Compute the output shape produced by reducing `in_shape` over the
// ascending-sorted dimension list `dim`. With keepdim the reduced axes are
// kept as size 1; otherwise they are dropped (full reduction -> 0-d shape).
// Extracted to replace the duplicated keepdim / no-keepdim loops.
std::vector<size_t> reducedShape(const std::vector<size_t> &in_shape,
                                 const std::vector<size_t> &dim,
                                 bool keepdim) {
    if (dim.size() == in_shape.size() && !keepdim) {
        return {};
    }
    std::vector<size_t> out_shape;
    size_t j = 0;
    for (size_t i = 0; i < in_shape.size(); i++) {
        if (j < dim.size() && dim[j] == i) {
            if (keepdim) {
                out_shape.push_back(1);
            }
            j++;
        } else {
            out_shape.push_back(in_shape[i]);
        }
    }
    return out_shape;
}

} // namespace

// Dispatcher singleton mapping device types to registered Var_Mean kernels.
common::OpDispatcher<Var_Mean::schema> &Var_Mean::dispatcher() {
    static common::OpDispatcher<Var_Mean::schema> dispatcher_;
    return dispatcher_;
};

// Dispatch a fused variance+mean computation to the implementation registered
// for the device `input` lives on. Throws if none is registered.
void Var_Mean::execute(Tensor var_output, Tensor mean_output, Tensor input,
                       std::vector<size_t> dim, bool unbiased, bool keepdim) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, mean_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No Var_Mean implementation found for device type: "
                                 + std::to_string(static_cast<int>(device_type)));
    }
    func(var_output, mean_output, input, dim, unbiased, keepdim);
}

// Allocate both outputs and return (variance, mean) of `input` over `dim`.
// An empty `dim` means "reduce over every dimension".
std::pair<Tensor, Tensor> var_mean(Tensor input, std::vector<size_t> dim,
                                   bool unbiased, bool keepdim) {
    auto in_shape = input->shape();
    if (dim.empty()) {
        for (size_t i = 0; i < in_shape.size(); i++) {
            dim.push_back(i);
        }
    }
    std::sort(dim.begin(), dim.end()); // reducedShape expects ascending dims
    std::vector<size_t> out_shape = reducedShape(in_shape, dim, keepdim);
    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
    auto mean_output = Tensor::empty(out_shape, input->dtype(), input->device());
    var_mean_(var_output, mean_output, input, dim, unbiased, keepdim);
    return {var_output, mean_output};
}

// In-place variant: write variance and mean into caller-provided tensors.
void var_mean_(Tensor var_output, Tensor mean_output, Tensor input,
               std::vector<size_t> dim, bool unbiased, bool keepdim) {
    Var_Mean::execute(var_output, mean_output, input, dim, unbiased, keepdim);
}

} // namespace infinicore::op
src/infinicore/ops/var_mean/var_mean_infiniop.cc
0 → 100644
View file @
210e31d3
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/var_mean.hpp"
#include <infiniop.h>
// TODO: the calculate implementation still needs revision.
namespace infinicore::op::var_mean_impl::infiniop {

// Thread-local cache of infiniop VarMean descriptors keyed by a hash of the
// call signature; the eviction callback destroys the descriptor.
thread_local common::OpCache<size_t, infiniopVarMeanDescriptor_t> caches(
    100, // capacity
    [](infiniopVarMeanDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyVarMeanDescriptor(desc));
            desc = nullptr;
        }
    });

// Run the fused infiniop VarMean kernel: variance and mean of `input` over
// `dim` into `var_output` / `mean_output`.
void calculate(Tensor var_output, Tensor mean_output, Tensor input,
               std::vector<size_t> dim, bool unbiased, bool keepdim) {
    const size_t seed = hash_combine(var_output, mean_output, input,
                                     dim.size(), unbiased, keepdim);

    const auto device_type = context::getDevice().getType();
    const auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);

    infiniopVarMeanDescriptor_t desc = nullptr;
    if (auto cached = cache.get(seed); cached) {
        desc = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateVarMeanDescriptor(
            context::getInfiniopHandle(var_output->device()), &desc,
            var_output->desc(), mean_output->desc(), input->desc(),
            dim.data(), dim.size(), unbiased, keepdim));
        cache.put(seed, desc);
    }

    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetVarMeanWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);

    INFINICORE_CHECK_ERROR(infiniopVarMean(
        desc, workspace->data(), workspace_size,
        var_output->data(), mean_output->data(), input->data(),
        dim.data(), dim.size(), unbiased, keepdim,
        context::getStream()));
}

// Self-registration for every backend served by this generic implementation.
static bool registered = []() {
    Var_Mean::dispatcher().registerDevice(
        {Device::Type::CPU, Device::Type::NVIDIA, Device::Type::METAX,
         Device::Type::MOORE, Device::Type::ILUVATAR},
        &calculate, false);
    return true;
}();

} // namespace infinicore::op::var_mean_impl::infiniop
src/infinicore/pybind11/ops.hpp
View file @
210e31d3
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/asinh.hpp"
#include "ops/attention.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/avg_pool1d.hpp"
...
@@ -33,7 +34,11 @@
...
@@ -33,7 +34,11 @@
#include "ops/rope.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/sum.hpp"
#include "ops/swiglu.hpp"
#include "ops/swiglu.hpp"
#include "ops/topk.hpp"
#include "ops/var.hpp"
#include "ops/var_mean.hpp"
namespace
py
=
pybind11
;
namespace
py
=
pybind11
;
...
@@ -73,6 +78,11 @@ inline void bind(py::module &m) {
...
@@ -73,6 +78,11 @@ inline void bind(py::module &m) {
bind_linear_w8a8i8
(
m
);
bind_linear_w8a8i8
(
m
);
bind_silu_and_mul
(
m
);
bind_silu_and_mul
(
m
);
bind_equal
(
m
);
bind_equal
(
m
);
bind_sum
(
m
);
bind_var_mean
(
m
);
bind_var
(
m
);
bind_topk
(
m
);
bind_all
(
m
);
}
}
}
// namespace infinicore::ops
}
// namespace infinicore::ops
src/infinicore/pybind11/ops/all.hpp
0 → 100644
View file @
210e31d3
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // fix: required for dim.cast<std::vector<size_t>>()

#include "infinicore/ops/all.hpp"

#include <stdexcept>
#include <vector>

namespace py = pybind11;

namespace infinicore::ops {

// Normalize the python-side `dim` argument (None / tuple / list / int) into
// an explicit list of axes. None selects every dimension of `input`.
// Extracted to remove the duplicated parsing in py_all / py_all_.
inline std::vector<size_t> parse_all_dims(const Tensor &input, const py::object &dim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return dim_vec;
    }
    if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return dim.cast<std::vector<size_t>>();
    }
    if (py::isinstance<py::int_>(dim)) {
        return std::vector<size_t>(1, dim.cast<size_t>());
    }
    throw std::invalid_argument("dim must be a tuple or an integer");
}

// `inline` added: non-inline definitions in a header violate the ODR when the
// header is included from more than one translation unit.

// Logical-AND reduction of `input` over `dim`; returns a new tensor.
inline Tensor py_all(Tensor input, py::object dim, bool keepdim) {
    return op::all(input, parse_all_dims(input, dim), keepdim);
}

// In-place variant: result is written into `output`.
inline void py_all_(Tensor output, Tensor input, py::object dim, bool keepdim) {
    op::all_(output, input, parse_all_dims(input, dim), keepdim);
}

// Register the `all` / `all_` bindings on module `m`.
inline void bind_all(py::module &m) {
    m.def("all", &py_all,
          py::arg("input"), py::arg("dim"), py::arg("keepdim"),
          R"doc(All of input tensor along the given dimensions.)doc");
    m.def("all_", &py_all_,
          py::arg("output"), py::arg("input"), py::arg("dim"), py::arg("keepdim"),
          R"doc(In-place tensor all.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/sum.hpp
0 → 100644
View file @
210e31d3
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // fix: required for dim.cast<std::vector<size_t>>()

#include "infinicore/ops/sum.hpp"

#include <stdexcept>
#include <vector>

namespace py = pybind11;

namespace infinicore::ops {

// Normalize the python-side `dim` argument (None / tuple / list / int) into
// an explicit list of axes. None selects every dimension of `input`.
// Extracted to remove the duplicated parsing in py_sum / py_sum_.
inline std::vector<size_t> parse_sum_dims(const Tensor &input, const py::object &dim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return dim_vec;
    }
    if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return dim.cast<std::vector<size_t>>();
    }
    if (py::isinstance<py::int_>(dim)) {
        return std::vector<size_t>(1, dim.cast<size_t>());
    }
    throw std::invalid_argument("dim must be a tuple or an integer");
}

// `inline` added: non-inline definitions in a header violate the ODR when the
// header is included from more than one translation unit.

// Sum of `input` over `dim`; returns a new tensor.
inline Tensor py_sum(Tensor input, py::object dim, bool keepdim) {
    return op::sum(input, parse_sum_dims(input, dim), keepdim);
}

// In-place variant: result is written into `output`.
inline void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) {
    op::sum_(output, input, parse_sum_dims(input, dim), keepdim);
}

// Register the `sum` / `sum_` bindings on module `m`.
inline void bind_sum(py::module &m) {
    m.def("sum", &py_sum,
          py::arg("input"), py::arg("dim"), py::arg("keepdim"),
          R"doc(Sum of input tensor along the given dimensions.)doc");
    m.def("sum_", &py_sum_,
          py::arg("output"), py::arg("input"), py::arg("dim"), py::arg("keepdim"),
          R"doc(In-place tensor sum.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/topk.hpp
0 → 100644
View file @
210e31d3
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // required for the std::pair return conversion

#include "infinicore/ops/topk.hpp"

#include <stdexcept>
#include <utility>

namespace py = pybind11;

namespace infinicore::ops {

// Resolve the python-side `dim`: -1 means the last dimension of `input`;
// any other negative value is rejected. Extracted to remove the duplicated
// handling in py_topk / py_topk_.
inline size_t resolve_topk_dim(const Tensor &input, int dim) {
    if (dim == -1) {
        return input->ndim() - 1;
    }
    if (dim >= 0) {
        return static_cast<size_t>(dim);
    }
    throw std::invalid_argument("invalid argument: dim");
}

// `inline` added: non-inline definitions in a header violate the ODR when the
// header is included from more than one translation unit.

// Top-k values and indices of `input` along `dim`; returns (values, indices).
inline std::pair<Tensor, Tensor> py_topk(Tensor input, size_t k, int dim,
                                         bool largest, bool sorted) {
    return op::topk(input, k, resolve_topk_dim(input, dim), largest, sorted);
}

// In-place variant: results are written into the caller-provided tensors.
inline void py_topk_(Tensor values_output, Tensor indices_output, Tensor input,
                     size_t k, int dim, bool largest, bool sorted) {
    op::topk_(values_output, indices_output, input, k,
              resolve_topk_dim(input, dim), largest, sorted);
}

// Register the `topk` / `topk_` bindings on module `m`.
inline void bind_topk(py::module &m) {
    m.def("topk", &py_topk,
          py::arg("input"), py::arg("k"), py::arg("dim"),
          py::arg("largest"), py::arg("sorted"),
          R"doc(topk of input tensor along the given dimensions.)doc");
    m.def("topk_", &py_topk_,
          py::arg("values_output"), py::arg("indices_output"), py::arg("input"),
          py::arg("k"), py::arg("dim"), py::arg("largest"), py::arg("sorted"),
          R"doc(In-place tensor topk_.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/var.hpp
0 → 100644
View file @
210e31d3
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // fix: required for dim.cast<std::vector<size_t>>()

#include "infinicore/ops/var.hpp"

#include <stdexcept>
#include <vector>

namespace py = pybind11;

namespace infinicore::ops {

// Normalize the python-side `dim` argument (None / tuple / list / int) into
// an explicit list of axes. None selects every dimension of `input`.
// Extracted to remove the duplicated parsing in py_var / py_var_; this also
// unifies the previously inconsistent error messages between the two.
inline std::vector<size_t> parse_var_dims(const Tensor &input, const py::object &dim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return dim_vec;
    }
    if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return dim.cast<std::vector<size_t>>();
    }
    if (py::isinstance<py::int_>(dim)) {
        return std::vector<size_t>(1, dim.cast<size_t>());
    }
    throw std::invalid_argument("dim must be a tuple or an integer");
}

// `inline` added: non-inline definitions in a header violate the ODR when the
// header is included from more than one translation unit.

// Variance of `input` over `dim`; returns a new tensor.
inline Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) {
    return op::var(input, parse_var_dims(input, dim), unbiased, keepdim);
}

// In-place variant: result is written into `var_output`.
inline void py_var_(Tensor var_output, Tensor input, py::object dim,
                    bool unbiased, bool keepdim) {
    op::var_(var_output, input, parse_var_dims(input, dim), unbiased, keepdim);
}

// Register the `var` / `var_` bindings on module `m`.
inline void bind_var(py::module &m) {
    m.def("var", &py_var,
          py::arg("input"), py::arg("dim"), py::arg("unbiased"), py::arg("keepdim"),
          R"doc(Var of input tensor along the given dimensions.)doc");
    m.def("var_", &py_var_,
          py::arg("var_output"), py::arg("input"), py::arg("dim"),
          py::arg("unbiased"), py::arg("keepdim"),
          R"doc(In-place tensor Var .)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/var_mean.hpp
0 → 100644
View file @
210e31d3
#pragma once

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // fix: required for casts and the std::pair return

#include "infinicore/ops/var_mean.hpp"

#include <stdexcept>
#include <utility>
#include <vector>

namespace py = pybind11;

namespace infinicore::ops {

// Normalize the python-side `dim` argument (None / tuple / list / int) into
// an explicit list of axes. None selects every dimension of `input`.
// Extracted to remove the duplicated parsing in py_var_mean / py_var_mean_;
// this also unifies the previously inconsistent error messages.
inline std::vector<size_t> parse_var_mean_dims(const Tensor &input, const py::object &dim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); i++) {
            dim_vec.push_back(i);
        }
        return dim_vec;
    }
    if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return dim.cast<std::vector<size_t>>();
    }
    if (py::isinstance<py::int_>(dim)) {
        return std::vector<size_t>(1, dim.cast<size_t>());
    }
    throw std::invalid_argument("dim must be a tuple or an integer");
}

// `inline` added: non-inline definitions in a header violate the ODR when the
// header is included from more than one translation unit.

// (variance, mean) of `input` over `dim`; returns a pair of new tensors.
inline std::pair<Tensor, Tensor> py_var_mean(Tensor input, py::object dim,
                                             bool unbiased, bool keepdim) {
    return op::var_mean(input, parse_var_mean_dims(input, dim), unbiased, keepdim);
}

// In-place variant: results are written into the caller-provided tensors.
inline void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input,
                         py::object dim, bool unbiased, bool keepdim) {
    op::var_mean_(var_output, mean_output, input,
                  parse_var_mean_dims(input, dim), unbiased, keepdim);
}

// Register the `var_mean` / `var_mean_` bindings on module `m`.
inline void bind_var_mean(py::module &m) {
    m.def("var_mean", &py_var_mean,
          py::arg("input"), py::arg("dim"), py::arg("unbiased"), py::arg("keepdim"),
          R"doc(Var & Mean of input tensor along the given dimensions.)doc");
    m.def("var_mean_", &py_var_mean_,
          py::arg("var_output"), py::arg("mean_output"), py::arg("input"),
          py::arg("dim"), py::arg("unbiased"), py::arg("keepdim"),
          R"doc(In-place tensor Var & Mean .)doc");
}

} // namespace infinicore::ops
src/infiniop/ops/all/all_desc.h
0 → 100644
View file @
210e31d3
#ifndef INFINIOP_ALL_DESCRIPTOR_H_
#define INFINIOP_ALL_DESCRIPTOR_H_

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"

// Generates the op::all::<NAMESPACE>::Descriptor class for one backend.
// The descriptor owns a backend-specific Opaque state, the shape/stride
// metadata (AllInfo) and the workspace size; `create` validates the tensor
// descriptors and builds an instance, `calculate` runs the reduction.
// NOTE: no comments inside the macro body — a `//` would comment out the
// trailing line-continuation backslash.
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::all::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AllInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AllInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
size_t *dim, \
size_t dim_size, \
bool keepdim, \
void *stream) const; \
}; \
}

#endif // INFINIOP_ALL_DESCRIPTOR_H_
src/infiniop/ops/all/cpu/all_cpu.cc
0 → 100644
View file @
210e31d3
#include "all_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include <iostream>
namespace op::all::cpu {

Descriptor::~Descriptor() {}

// Validate the tensor descriptors, build the reduction metadata (AllInfo)
// and allocate a CPU descriptor. The CPU path needs no opaque state and no
// workspace.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(nullptr, result.take(), 0,
                               handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

// Logical-AND reduction over the axes described by `info`.
template <typename Tdata>
infiniStatus_t calculateAll(
    const AllInfo &info,
    bool *output,
    const Tdata *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    if (info.reduce_dim_size == info.ndim) {
        // Every axis is reduced: a single boolean result.
        bool acc = true;
        for (size_t index = 0; index < info.input_size; index++) {
            size_t input_offset = op::common_cpu::indexToOffset(
                index, info.ndim,
                info.permuted_input_shape.data(),
                info.permuted_input_strides.data());
            acc = acc && input[input_offset];
        }
        output[0] = acc;
        return INFINI_STATUS_SUCCESS;
    }
    // Partial reduction: one AND over `reduce_num` contiguous (in the
    // permuted layout) inputs per output element.
    for (size_t i = info.output_size; i-- > 0;) {
        size_t output_offset = op::common_cpu::indexToOffset(
            i, info.output_shape.size(),
            info.output_shape.data(),
            info.output_strides.data());
        bool acc = true;
        for (size_t j = 0; j < info.reduce_num; j++) {
            size_t input_flat = j + i * info.reduce_num;
            size_t input_offset = op::common_cpu::indexToOffset(
                input_flat, info.ndim,
                info.permuted_input_shape.data(),
                info.permuted_input_strides.data());
            acc = acc && static_cast<bool>(input[input_offset]);
        }
        output[output_offset] = acc;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

// Dispatch on the input dtype; only BOOL and U8 inputs are supported.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_BOOL:
        return calculateAll<bool>(
            _info,
            reinterpret_cast<bool *>(output),
            reinterpret_cast<const bool *>(input),
            dim, dim_size, keepdim);
    case INFINI_DTYPE_U8:
        return calculateAll<uint8_t>(
            _info,
            reinterpret_cast<bool *>(output),
            reinterpret_cast<const uint8_t *>(input),
            dim, dim_size, keepdim);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::all::cpu
src/infiniop/ops/all/cpu/all_cpu.h
0 → 100644
View file @
210e31d3
#ifndef __INFINIOP_ALL_CPU_H__
#define __INFINIOP_ALL_CPU_H__

#include "../all_desc.h"

// Instantiate the shared DESCRIPTOR macro: declares op::all::cpu::Descriptor.
DESCRIPTOR(cpu);

#endif // __INFINIOP_ALL_CPU_H__
src/infiniop/ops/all/cuda/kernel.cuh
0 → 100644
View file @
210e31d3
#ifndef __ALL_CUDA_H__
#define __ALL_CUDA_H__
// Maps a flat (row-major) element index to a strided memory offset.
//
// Walks the dimensions from innermost to outermost, peeling one coordinate
// off the flat index per dimension and accumulating coordinate * stride.
// Usable from both host and device code.
__forceinline__ __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = ndim; dim != 0;) {
        --dim;
        const size_t coord = remaining % shape[dim];
        remaining /= shape[dim];
        offset += coord * strides[dim];
    }
    return offset;
}
// Stage 1 of the full ("reduce everything to one scalar") all-reduction:
// each block ANDs up to BLOCK_SIZE input elements down to a single bool and
// writes it to temp_output[blockIdx.x]. A second kernel combines the
// per-block results.
//
// Elements are read through the permuted shape/strides, so non-contiguous
// inputs are handled. Out-of-range threads contribute `true`, the identity
// of logical AND.
//
// NOTE(review): the shared-memory tree reduction halves blockDim.x each
// step, which only covers every element when blockDim.x is a power of two —
// confirm the launch configuration guarantees this (it is launched with
// BLOCK_SIZE = 256 elsewhere in this commit).
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allReduceTempKernel(
    bool *temp_output,
    const Tdata *input,
    size_t input_size,
    size_t permuted_input_shape_size,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx < input_size) {
        // Translate the flat index into a strided offset in the (possibly
        // non-contiguous) permuted input tensor.
        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        s_data[tid] = static_cast<bool>(input[input_offset]);
    } else {
        // Padding threads contribute the AND identity.
        s_data[tid] = true;
    }
    __syncthreads();
    // Pairwise AND in shared memory until s_data[0] holds the block result.
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        temp_output[blockIdx.x] = s_data[0];
    }
}
// Stage 2 of the full all-reduction: a single block ANDs the per-block
// results produced by allReduceTempKernel into one scalar written to *output.
//
// Each thread first folds a strided slice of block_results into a private
// accumulator (so num_blocks may exceed BLOCK_SIZE), then the partials are
// combined with a shared-memory tree reduction.
//
// NOTE(review): as in stage 1, the halving loop assumes BLOCK_SIZE is a
// power of two — confirm at the launch site.
template <size_t BLOCK_SIZE>
__global__ void finalAllReduceKernel(
    bool *output,
    const bool *block_results,
    size_t num_blocks) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    // Sequential fold over this thread's strided share of the block results.
    bool thread_val = true;
    for (size_t i = tid; i < num_blocks; i += blockDim.x) {
        thread_val = thread_val && block_results[i];
    }
    s_data[tid] = thread_val;
    __syncthreads();
    // Tree-reduce the per-thread partials down to s_data[0].
    for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *output = s_data[0];
    }
}
// Partial all-reduction: one thread computes one output element.
//
// Because the reduced axes were permuted to the end of the input layout,
// the reduce_num elements feeding output element `idx` occupy the flat
// index range [idx * reduce_num, (idx + 1) * reduce_num) in the permuted
// layout; each flat index is mapped through the permuted strides to the
// actual memory offset.
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allKernel(
    bool *output,
    const Tdata *input,
    size_t permuted_input_shape_size,
    size_t output_shape_size,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    size_t *output_shape,
    ptrdiff_t *permuted_input_strides,
    ptrdiff_t *output_strides) {
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx >= output_size) {
        return;
    }
    // Strided offset of this thread's output element.
    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
    // AND together the reduce_num inputs belonging to this output element.
    bool tempRes = true;
    for (size_t i = 0; i < reduce_num; i++) {
        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        tempRes = tempRes && static_cast<bool>(input[input_offset]);
    }
    output[output_index] = tempRes;
}
#endif // __ALL_CUDA_H__
src/infiniop/ops/all/info.h
0 → 100644
View file @
210e31d3
#ifndef __ALL_INFO_H__
#define __ALL_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <algorithm>
#include <cstddef>
#include <vector>
namespace
op
::
all
{
// Metadata shared by the backend implementations of the "all" reduction.
//
// The input dimensions are permuted so that all reduced axes come last; a
// backend can then treat every run of `reduce_num` consecutive flat indices
// (in the permuted layout) as the elements feeding one output value.
class AllInfo {
    // Private: instances are produced by create() via the brace-initializer
    // below; the defaulted ctor is presumably needed by utils::Result —
    // TODO confirm.
    AllInfo() = default;

public:
    infiniDtype_t dtype;                           // input element dtype
    std::vector<size_t> permuted_input_shape;      // input shape with reduced axes moved last
    std::vector<size_t> output_shape;
    std::vector<ptrdiff_t> permuted_input_strides; // input strides permuted the same way
    std::vector<ptrdiff_t> output_strides;
    size_t reduce_dim_size; // number of reduced dimensions (== dim_size)
    size_t reduce_num;      // number of elements to reduce for each output element
    size_t input_size;      // total number of input elements
    size_t output_size;     // total number of output elements
    size_t ndim;            // number of input dimensions

    // Builds the permuted layout from the reduced axes dim[0..dim_size).
    // Non-reduced axes keep their relative order and are followed by the
    // reduced axes, so reductions run over the trailing dimensions.
    //
    // NOTE(review): `keepdim` is accepted but unused — presumably the output
    // descriptor already reflects it; confirm that shape validation happens
    // in the caller. Entries of `dim` are not checked against input_ndim
    // here either.
    static utils::Result<AllInfo> create(
        infiniopTensorDescriptor_t output_desc,
        infiniopTensorDescriptor_t input_desc,
        size_t *dim,
        size_t dim_size,
        bool keepdim) {
        auto input_shape = input_desc->shape();
        auto input_strides = input_desc->strides();
        size_t input_ndim = input_desc->ndim();
        // Product of the reduced extents = elements per output value.
        size_t reduce_num = 1;
        for (size_t i = 0; i < dim_size; i++) {
            reduce_num *= input_shape[dim[i]];
        }
        // Non-reduced axes first (original order preserved)...
        std::vector<size_t> permute_order;
        for (size_t i = 0; i < input_ndim; i++) {
            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
                permute_order.push_back(i);
            }
        }
        // ...then the reduced axes.
        for (size_t i = 0; i < dim_size; i++) {
            permute_order.push_back(dim[i]);
        }
        // Apply the permutation to both shape and strides.
        std::vector<size_t> permuted_input_shape;
        std::vector<ptrdiff_t> permuted_input_strides;
        for (size_t i = 0; i < permute_order.size(); i++) {
            permuted_input_shape.push_back(input_shape[permute_order[i]]);
            permuted_input_strides.push_back(input_strides[permute_order[i]]);
        }
        // WARNING: aggregate brace-init — the argument order below must
        // match the member declaration order above exactly.
        return utils::Result<AllInfo>(AllInfo{
            input_desc->dtype(),
            permuted_input_shape,
            output_desc->shape(),
            permuted_input_strides,
            output_desc->strides(),
            dim_size,
            reduce_num,
            input_desc->numel(),
            output_desc->numel(),
            input_ndim});
    }
};
}
// namespace op::all
#endif
src/infiniop/ops/all/metax/all_metax.h
0 → 100644
View file @
210e31d3
#ifndef __ALL_METAX_H__
#define __ALL_METAX_H__

#include "../all_desc.h"

// Stamp out the MetaX backend's Descriptor class for the "all" op via the
// shared DESCRIPTOR macro from all_desc.h.
DESCRIPTOR(metax);

#endif // __ALL_METAX_H__
src/infiniop/ops/all/metax/all_metax.maca
0 → 100644
View file @
210e31d3
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_metax.h"
namespace op::all::metax {
// Backend-private state for the MetaX descriptor: a shared handle to the
// per-device runtime internals (used below for device-capability queries
// such as maxThreadsPerBlock).
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};
// Releases the backend-private state owned by this descriptor.
Descriptor::~Descriptor() {
    delete _opaque;
}
// Builds the reduction metadata (AllInfo) and sizes the device workspace.
//
// The workspace holds the permuted input shape/strides and output
// shape/strides that launchKernel copies to the device before launching.
// NOTE(review): the temporary per-block result buffer used by the
// full-reduction path is NOT part of this workspace — launchKernel
// allocates it separately with hcMalloc.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    // Room for input shape+strides and output shape+strides on the device.
    size_t workspace_size = 0;
    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace {

// Copies the shape/stride metadata into the device workspace and launches
// the boolean "all" reduction kernels on `stream`.
//
// Two paths:
//  - full reduction (reduce_num == input_size): two-stage block reduction
//    through a temporary per-block result buffer;
//  - partial reduction: one thread per output element (allKernel).
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
    const AllInfo &info,
    bool *output, const Tdata *input,
    hcStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    size_t output_ndim = info.output_shape.size();
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;
    // Workspace layout: [input shape | output shape | input strides | output strides].
    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
    size_t workspace_offset = 0;
    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
    size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
    ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
    // Stage the metadata on the device; same stream as the kernels, so the
    // copies are ordered before the launches.
    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
    if (info.reduce_num == input_size) {
        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        bool *temp_output;
        CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool)));
        // The kernels use statically sized shared memory, so no dynamic
        // shared-memory bytes are requested.
        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
        // BUGFIX: the final pass must run on the same stream as the first
        // pass; it was previously launched on the default stream, racing
        // with allReduceTempKernel's writes to temp_output.
        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
        // BUGFIX: kernel launches are asynchronous — drain the stream before
        // freeing temp_output, otherwise the buffer could be released while
        // the kernels are still using it.
        // NOTE(review): hcStreamSynchronize mirrors cudaStreamSynchronize in
        // the hc runtime — confirm the name against the metax device headers.
        CHECK_METAX(hcStreamSynchronize(stream));
        CHECK_METAX(hcFree(temp_output));
    } else {
        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            output, input, input_ndim, output_ndim, output_size, reduce_num,
            permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace
// Dispatches the MetaX "all" reduction by element dtype at a fixed block
// size. dim / dim_size / keepdim are accepted for interface parity but are
// unused here — the reduction layout was baked into _info at create() time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream_) const {
    hcStream_t stream = (hcStream_t)stream_;
// Invoke launchKernel with the concrete element type.
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
    launchKernel<BLOCK_SIZE, Tdata>( \
        _info, \
        (bool *)output, (const Tdata *)input, \
        stream, workspace, workspace_size)
// Select the element type from the descriptor's dtype; only BOOL and U8
// are supported.
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
    { \
        if (_info.dtype == INFINI_DTYPE_BOOL) \
            return CALCULATE_ALL(BLOCK_SIZE, bool); \
        else if (_info.dtype == INFINI_DTYPE_U8) \
            return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
        else \
            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
    }
    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    // Unreachable: both branches above return.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::metax
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment