Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
b2660e66
Unverified
Commit
b2660e66
authored
Mar 11, 2026
by
thatPepe
Committed by
GitHub
Mar 11, 2026
Browse files
Merge pull request #1070 from InfiniTensor/issue/1031_revert
Issue/1031 revert T1-1-9
parents
037140c0
45a3794b
Changes
71
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1 addition
and
1127 deletions
+1
-1127
src/infinicore/ops/asinh/asinh_infiniop.cc
src/infinicore/ops/asinh/asinh_infiniop.cc
+0
-52
src/infinicore/ops/baddbmm/baddbmm.cc
src/infinicore/ops/baddbmm/baddbmm.cc
+0
-100
src/infinicore/ops/bilinear/bilinear.cc
src/infinicore/ops/bilinear/bilinear.cc
+0
-119
src/infinicore/ops/fmod/fmod.cc
src/infinicore/ops/fmod/fmod.cc
+0
-28
src/infinicore/ops/fmod/fmod_infiniop.cc
src/infinicore/ops/fmod/fmod_infiniop.cc
+0
-52
src/infinicore/pybind11/ops.hpp
src/infinicore/pybind11/ops.hpp
+1
-12
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
+0
-39
src/infinicore/pybind11/ops/asinh.hpp
src/infinicore/pybind11/ops/asinh.hpp
+0
-24
src/infinicore/pybind11/ops/baddbmm.hpp
src/infinicore/pybind11/ops/baddbmm.hpp
+0
-56
src/infinicore/pybind11/ops/bilinear.hpp
src/infinicore/pybind11/ops/bilinear.hpp
+0
-61
src/infinicore/pybind11/ops/fmod.hpp
src/infinicore/pybind11/ops/fmod.hpp
+0
-26
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
+0
-47
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
...op/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
+0
-94
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
...iop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
+0
-8
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
+0
-54
src/infiniop/ops/adaptive_max_pool1d/info.h
src/infiniop/ops/adaptive_max_pool1d/info.h
+0
-65
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
...s/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
+0
-8
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
.../adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
+0
-130
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
...ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
+0
-8
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
...ps/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
+0
-144
No files found.
src/infinicore/ops/asinh/asinh_infiniop.cc
deleted
100644 → 0
View file @
037140c0
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/asinh.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::asinh_impl::infiniop {

// Thread-local cache of infiniop asinh descriptors, keyed by a hash of the
// (y, x) tensor pair. The eviction callback destroys the underlying infiniop
// descriptor so cached handles are never leaked.
thread_local common::OpCache<size_t, infiniopAsinhDescriptor_t> caches(
    100, // capacity
    [](infiniopAsinhDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc));
            desc = nullptr; // guard against double-destroy on re-entry
        }
    });

// Computes y = asinh(x) element-wise through the infiniop backend.
// A descriptor matching this (y, x) configuration is fetched from the
// per-device cache, or created and cached on a miss.
void calculate(Tensor y, Tensor x) {
    size_t seed = hash_combine(y, x);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopAsinhDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor(
            context::getInfiniopHandle(y->device()),
            &desc,
            y->desc(),
            x->desc()));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Query and allocate the scratch workspace this descriptor requires.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopAsinh(
        desc,
        workspace->data(),
        workspace_size,
        y->data(),
        x->data(),
        context::getStream()));
}

// Self-registration at static-initialization time: installs `calculate` for
// all device types. NOTE(review): the `false` flag presumably means "do not
// override existing registrations" — confirm against OpDispatcher::registerAll.
static bool registered = []() {
    Asinh::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::asinh_impl::infiniop
src/infinicore/ops/baddbmm/baddbmm.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/baddbmm.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {

// Inline BLAS-compatibility check to reduce function-call overhead
// (comment translated from Chinese). A tensor is usable by the GEMM
// backend when its innermost matrix has at least one unit stride and,
// for 3-D inputs, a non-broadcast batch stride.
inline bool is_blas_compatible(const Tensor &t) {
    const auto ndim = t->ndim();
    if (ndim == 2) {
        const auto rs = t->stride(0);
        const auto cs = t->stride(1);
        // Neither rows nor columns contiguous: not BLAS-addressable.
        if (rs != 1 && cs != 1) {
            return false;
        }
        // Both strides 1 is only coherent for a degenerate (1 x n / n x 1) matrix.
        if (rs == 1 && cs == 1) {
            return t->shape()[0] == 1 || t->shape()[1] == 1;
        }
        return true;
    } else if (ndim == 3) {
        const auto rs = t->stride(1);
        const auto cs = t->stride(2);
        // Broadcast batch dimension (stride 0 with batch > 1) is rejected.
        if (t->shape()[0] > 1 && t->stride(0) == 0) {
            return false;
        }
        if (rs != 1 && cs != 1) {
            return false;
        }
        if (rs == 1 && cs == 1) {
            return t->shape()[1] == 1 || t->shape()[2] == 1;
        }
        return true;
    }
    return false;
}

// Copies `input` into `output` (shape {batch_size, m, n}), broadcasting a 2-D
// input or a 3-D input with a singleton batch across the batch dimension via a
// zero batch stride.
inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) {
    const auto input_ndim = input->ndim();
    if (input_ndim == 2) {
        rearrange_(output, input->as_strided({batch_size, m, n}, {0, input->stride(0), input->stride(1)}));
    } else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) {
        rearrange_(output, input->as_strided({batch_size, m, n}, {0, input->stride(1), input->stride(2)}));
    } else {
        rearrange_(output, input);
    }
}

// out = beta * input + alpha * (batch1 @ batch2), PyTorch-style baddbmm.
// batch1 is expected as {B, m, k} and batch2 as {B, k, n} — shapes are read
// from those positions but not validated here.
Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    // Repack operands only when their layout is not directly GEMM-usable.
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    // beta == 0: the input term vanishes, so a plain GEMM suffices.
    if (beta == 0.0f) {
        return gemm(a, b, alpha, 0.0f);
    }
    Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
    // Seed the accumulator with (broadcast) input, then fused multiply-add.
    prepare_gemm_input(result, input, batch_size, m, n);
    gemm_(result, a, b, alpha, beta);
    return result;
}

// In-place variant: writes beta * input + alpha * (batch1 @ batch2) into `out`.
void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    // `out` can serve directly as the GEMM accumulator only when it is a
    // contiguous {B, m, n} tensor.
    const bool out_is_usable = out->is_contiguous()
        && out->ndim() == 3
        && out->shape()[0] == batch_size
        && out->shape()[1] == m
        && out->shape()[2] == n;
    if (out_is_usable) {
        // Skip the copy when `out` already aliases `input`'s storage.
        if (beta != 0.0f && input->data() != out->data()) {
            prepare_gemm_input(out, input, batch_size, m, n);
        }
        gemm_(out, a, b, alpha, beta);
    } else {
        // Compute into a contiguous scratch tensor, then scatter into `out`.
        Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
        if (beta != 0.0f) {
            prepare_gemm_input(result, input, batch_size, m, n);
        }
        gemm_(result, a, b, alpha, beta);
        rearrange_(out, result);
    }
}

} // namespace infinicore::op
src/infinicore/ops/bilinear/bilinear.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/bilinear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/rearrange.hpp"
#ifdef ENABLE_NVIDIA_API
namespace
op
::
gemm
::
nvidia
{
void
set_tf32_enabled
(
bool
);
}
#endif
namespace infinicore::op {
namespace {

// RAII guard: disables TF32 within the enclosing scope (translated from
// Chinese). Construction turns TF32 off, destruction turns it back on.
struct ScopedTF32Disable {
    ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
        // In a real project, add a check so this is only called on NVIDIA
        // devices (translated from Chinese).
        // `::op` forces lookup from the global namespace to avoid being
        // shadowed by the current infinicore::op (translated from Chinese).
        ::op::gemm::nvidia::set_tf32_enabled(false);
#endif
    }
    ~ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
        ::op::gemm::nvidia::set_tf32_enabled(true);
#endif
    }
};

// True when a 3-D tensor's layout is directly consumable by the GEMM backend:
// one unit stride in the matrix, a leading dimension at least as large as the
// matrix extent, and no broadcast batch stride.
inline bool is_gemm_compatible_3d(const Tensor &t) {
    if (t->ndim() != 3) {
        return false;
    }
    const auto batch = t->shape()[0];
    const auto rows = t->shape()[1];
    const auto cols = t->shape()[2];
    const auto bs = t->stride(0);
    const auto rs = t->stride(1);
    const auto cs = t->stride(2);
    if (rs != 1 && cs != 1) {
        return false;
    }
    if (cs == 1) {
        // Row-major: row stride is the leading dimension, must cover a row.
        if (rs < static_cast<int64_t>(cols)) {
            return false;
        }
    } else {
        // Column-major: column stride must cover a column.
        if (cs < static_cast<int64_t>(rows)) {
            return false;
        }
    }
    if (batch > 1 && bs == 0) {
        return false;
    }
    return true;
}

// Returns `t` unchanged when its layout already suits GEMM, otherwise a
// repacked copy.
inline Tensor ensure_gemm_compatible(const Tensor &t) {
    if (t->ndim() == 2) {
        return t->is_contiguous() ? t : rearrange(t);
    } else if (t->ndim() == 3) {
        return is_gemm_compatible_3d(t) ? t : rearrange(t);
    }
    return t->is_contiguous() ? t : rearrange(t);
}

} // anonymous namespace

// Bilinear transform: out[b, o] = x1[b, :] . weight[o, :, :] . x2[b, :] (+ bias),
// implemented as two matmuls with TF32 disabled for the duration.
Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    ScopedTF32Disable tf32_guard;
    const size_t batch_size = x1->shape()[0];
    const size_t in1_features = x1->shape()[1];
    const size_t in2_features = x2->shape()[1];
    const size_t out_features = weight->shape()[0];
    Tensor x1_compat = ensure_gemm_compatible(x1);
    Tensor x2_compat = ensure_gemm_compatible(x2);
    // Fold weight {out, in1, in2} into a {in1, out*in2} matrix so the first
    // contraction is a single 2-D matmul.
    Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous();
    Tensor weight_permuted = weight_cont->permute({1, 0, 2});
    Tensor weight_permuted_cont = weight_permuted->is_contiguous() ? weight_permuted : weight_permuted->contiguous();
    Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features});
    // intermediate[b, o*i2] = sum_i1 x1[b, i1] * W[i1, o*i2]
    Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f);
    Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features});
    Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1});
    Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed);
    // Second contraction: batched (1 x in2) @ (in2 x out) per sample.
    Tensor x2_row = x2_compat->view({batch_size, 1, in2_features});
    Tensor x2_row_compat = ensure_gemm_compatible(x2_row);
    Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f);
    Tensor out = out_3d->view({batch_size, out_features});
    if (bias) {
        // Broadcast the 1-D bias over the batch via a zero batch stride.
        Tensor bias_broadcast = (*bias)->as_strided({batch_size, out_features}, {0, (*bias)->strides()[0]});
        out = add(out, bias_broadcast);
    }
    return out;
}

// In-place variant: computes bilinear(...) and copies the result into `out`.
void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    Tensor result = bilinear(x1, x2, weight, bias);
    rearrange_(out, result);
}

} // namespace infinicore::op
src/infinicore/ops/fmod/fmod.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/fmod.hpp"
#include "../../utils.hpp"
namespace infinicore::op {

// Lazily-constructed singleton dispatcher mapping device types to fmod kernels.
common::OpDispatcher<Fmod::schema> &Fmod::dispatcher() {
    static common::OpDispatcher<Fmod::schema> instance;
    return instance;
}

// Routes c = fmod(a, b) to the kernel registered for c's device type.
// All three tensors must live on the same device.
void Fmod::execute(Tensor c, Tensor a, Tensor b) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
    infinicore::context::setDevice(c->device());
    auto kernel = dispatcher().lookup(c->device().getType());
    kernel(c, a, b);
}

// Out-of-place fmod: allocates the result with a's shape/dtype/device and
// delegates to the in-place form.
Tensor fmod(Tensor a, Tensor b) {
    auto result = Tensor::empty(a->shape(), a->dtype(), a->device());
    fmod_(result, a, b);
    return result;
}

// In-place fmod: thin wrapper over Fmod::execute.
void fmod_(Tensor c, Tensor a, Tensor b) {
    Fmod::execute(c, a, b);
}

} // namespace infinicore::op
src/infinicore/ops/fmod/fmod_infiniop.cc
deleted
100644 → 0
View file @
037140c0
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/fmod.hpp"
#include <infiniop.h>
namespace infinicore::op::fmod_impl::infiniop {

// Thread-local cache of infiniop fmod descriptors keyed by a tensor-config
// hash; the eviction callback destroys the underlying descriptor.
thread_local common::OpCache<size_t, infiniopFmodDescriptor_t> caches(
    100, // capacity
    [](infiniopFmodDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc));
            desc = nullptr; // guard against double-destroy
        }
    });

// Computes c = fmod(a, b) element-wise through the infiniop backend,
// reusing a cached descriptor for previously-seen tensor configurations.
void calculate(Tensor c, Tensor a, Tensor b) {
    // NOTE(review): the hash is seeded as (c, b, a) while the descriptor is
    // created from (c, a, b); the order only affects the cache key, but the
    // asymmetry looks accidental — confirm it was intended.
    size_t seed = hash_combine(c, b, a);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopFmodDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor(
            context::getInfiniopHandle(c->device()),
            &desc,
            c->desc(),
            a->desc(),
            b->desc()));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Query and allocate the scratch workspace this descriptor requires.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopFmod(
        desc,
        workspace->data(),
        workspace_size,
        c->data(),
        a->data(),
        b->data(),
        context::getStream()));
}

// Self-registration: installs `calculate` for all device types at static
// initialization time.
static bool registered = []() {
    Fmod::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::fmod_impl::infiniop
src/infinicore/pybind11/ops.hpp
View file @
b2660e66
...
...
@@ -2,21 +2,16 @@
#include <pybind11/pybind11.h>
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/baddbmm.hpp"
#include "ops/bilinear.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmod.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
...
...
@@ -45,18 +40,12 @@ namespace py = pybind11;
namespace
infinicore
::
ops
{
inline
void
bind
(
py
::
module
&
m
)
{
bind_adaptive_max_pool1d
(
m
);
bind_add
(
m
);
bind_add_rms_norm
(
m
);
bind_attention
(
m
);
bind_asinh
(
m
);
bind_baddbmm
(
m
);
bind_bilinear
(
m
);
bind_causal_softmax
(
m
);
bind_flash_attention
(
m
);
bind_kv_caching
(
m
);
bind_fmod
(
m
);
bind_random_sample
(
m
);
bind_linear
(
m
);
bind_matmul
(
m
);
bind_mul
(
m
);
...
...
@@ -77,12 +66,12 @@ inline void bind(py::module &m) {
bind_embedding
(
m
);
bind_linear_w8a8i8
(
m
);
bind_silu_and_mul
(
m
);
bind_equal
(
m
);
bind_sum
(
m
);
bind_var_mean
(
m
);
bind_var
(
m
);
bind_topk
(
m
);
bind_all
(
m
);
bind_equal
(
m
);
}
}
// namespace infinicore::ops
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/adaptive_max_pool1d.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the adaptive_max_pool1d bindings (out-of-place and in-place)
// on the given pybind11 module.
inline void bind_adaptive_max_pool1d(py::module &m) {
    m.def("adaptive_max_pool1d",
          &op::adaptive_max_pool1d,
          py::arg("x"),
          py::arg("output_size"),
          R"doc(1D Adaptive Max Pooling.
Args:
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
Returns:
Output tensor of shape (N, C, L_out) or (N, L_out)
)doc");
    m.def("adaptive_max_pool1d_",
          &op::adaptive_max_pool1d_,
          py::arg("y"),
          py::arg("x"),
          py::arg("output_size"),
          R"doc(In-place 1D Adaptive Max Pooling.
Args:
y: Output tensor of shape (N, C, L_out) or (N, L_out)
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/asinh.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/asinh.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the asinh bindings (out-of-place and in-place) on the module.
inline void bind_asinh(py::module &m) {
    m.def("asinh",
          &op::asinh,
          py::arg("x"),
          R"doc(Element-wise inverse hyperbolic sine function.)doc");
    m.def("asinh_",
          &op::asinh_,
          py::arg("y"),
          py::arg("x"),
          R"doc(In-place element-wise inverse hyperbolic sine function.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/baddbmm.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/baddbmm.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Wrapper exposing op::baddbmm with Python-style default arguments.
// Marked `inline`: this definition lives in a header, so a non-inline
// function here would violate the One Definition Rule as soon as the
// header is included from more than one translation unit.
inline Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    return op::baddbmm(input, batch1, batch2, beta, alpha);
}

// In-place wrapper; `inline` for the same ODR reason as above.
inline void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    op::baddbmm_(out, input, batch1, batch2, beta, alpha);
}

// Registers the baddbmm bindings (out-of-place and in-place) on the module.
inline void bind_baddbmm(py::module &m) {
    m.def("baddbmm",
          &py_baddbmm,
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(Batched matrix-matrix product with addition.
Args:
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
Returns:
Output tensor after baddbmm operation
)doc");
    m.def("baddbmm_",
          &py_baddbmm_,
          py::arg("out"),
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(In-place batched matrix-matrix product with addition.
Args:
out: Output tensor
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/bilinear.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/bilinear.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Wrapper converting a Python-side `bias` (Tensor or None) into
// std::optional<Tensor> before calling op::bilinear.
// Marked `inline`: this definition lives in a header, so a non-inline
// function here would violate the One Definition Rule as soon as the
// header is included from more than one translation unit.
inline Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    return op::bilinear(x1, x2, weight, bias_tensor);
}

// In-place wrapper; `inline` for the same ODR reason as above.
inline void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    op::bilinear_(out, x1, x2, weight, bias_tensor);
}

// Registers the bilinear bindings (out-of-place and in-place) on the module.
inline void bind_bilinear(py::module &m) {
    m.def("bilinear",
          &py_bilinear,
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(Bilinear transformation of two input tensors.
Args:
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
Returns:
Output tensor after bilinear transformation
)doc");
    m.def("bilinear_",
          &py_bilinear_,
          py::arg("out"),
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(In-place bilinear transformation of two input tensors.
Args:
out: Output tensor
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/fmod.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the fmod bindings (out-of-place and in-place) on the module.
inline void bind_fmod(py::module &m) {
    m.def("fmod",
          &op::fmod,
          py::arg("a"),
          py::arg("b"),
          R"doc(Element-wise floating point remainder of division of two tensors.)doc");
    m.def("fmod_",
          &op::fmod_,
          py::arg("c"),
          py::arg("a"),
          py::arg("b"),
          R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}

} // namespace infinicore::ops
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
deleted
100644 → 0
View file @
037140c0
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
// Stamps out the per-backend Descriptor class inside
// op::adaptive_max_pool1d::NAMESPACE. Each backend supplies its own Opaque
// state plus create()/calculate() definitions in its .cc/.cu file.
// (No comments inside the macro: a // before a line-continuation backslash
// would swallow the following macro line.)
#define DESCRIPTOR(NAMESPACE) \
    \
    namespace op::adaptive_max_pool1d::NAMESPACE { \
    class Descriptor final : public InfiniopDescriptor { \
        struct Opaque; \
        Opaque *_opaque; \
        AdaptiveMaxPool1dInfo _info; \
        size_t _workspace_size; \
        \
        Descriptor( \
            Opaque *opaque, \
            AdaptiveMaxPool1dInfo info, \
            size_t workspace_size, \
            infiniDevice_t device_type, \
            int device_id) \
            : InfiniopDescriptor{device_type, device_id}, \
              _opaque(opaque), \
              _info(info), \
              _workspace_size(workspace_size) {} \
        \
    public: \
        ~Descriptor(); \
        \
        size_t workspaceSize() const { return _workspace_size; } \
        \
        static infiniStatus_t create( \
            infiniopHandle_t handle, \
            Descriptor **desc_ptr, \
            infiniopTensorDescriptor_t y_desc, \
            infiniopTensorDescriptor_t x_desc, \
            size_t output_size); \
        \
        infiniStatus_t calculate( \
            void *workspace, size_t workspace_size, \
            void *y, \
            const void *x, \
            void *stream) const; \
    }; \
    }
#endif // ADAPTIVE_MAX_POOL1D_H
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
deleted
100644 → 0
View file @
037140c0
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {

Descriptor::~Descriptor() {}

// Validates shapes/dtypes via AdaptiveMaxPool1dInfo and builds a CPU
// descriptor. No opaque state and no workspace are needed on CPU.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(
        nullptr,
        result.take(),
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// CPU kernel: adaptive max pooling over the last dimension of a (N, C, L)
// or (N, L) tensor, parallelized over (batch, channel) pairs with OpenMP.
template <typename T>
infiniStatus_t adaptiveMaxPool1d(
    const AdaptiveMaxPool1dInfo *info,
    T *y,
    const T *x) {
    const size_t ndim = info->ndim();
    const size_t batch_size = info->shape[0];
    const size_t channels = ndim > 2 ? info->shape[1] : 1;
    const size_t input_length = info->input_length();
    const size_t output_length = info->output_length();
    // Total number of work blocks (Batch * Channels) — translated from Chinese.
    const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
    const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
    for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
        const size_t i = block_idx / channels; // batch index
        const size_t j = block_idx % channels; // channel index
        const T *x_ptr_base;
        T *y_ptr_base;
        if (ndim > 2) {
            // (N, C, L)
            x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
            y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
        } else {
            // (N, L)
            x_ptr_base = x + i * info->x_strides[0];
            y_ptr_base = y + i * info->y_strides[0];
        }
        for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
            // Adaptive window: [floor(k*L/O), ceil((k+1)*L/O)), computed in
            // integer arithmetic (the second division rounds up).
            size_t start_index = (out_idx * input_length) / output_length;
            size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
            start_index = std::max(start_index, size_t(0)); // no-op for unsigned; kept for symmetry
            end_index = std::min(end_index, input_length);
            size_t window_len = end_index - start_index;
            // window_len is unsigned, so this only skips empty windows.
            if (window_len <= 0) {
                continue;
            }
            const T *window_ptr = x_ptr_base + start_index * x_stride_last;
            auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
            y_ptr_base[out_idx] = utils::cast<T>(max_val);
        }
    }
    return INFINI_STATUS_SUCCESS;
}

// Dispatches on the tensor dtype to the typed kernel. Workspace arguments are
// unused on CPU.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    if (_info.atype == INFINI_DTYPE_F32) {
        return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
    } else if (_info.atype == INFINI_DTYPE_F16) {
        return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
    } else if (_info.atype == INFINI_DTYPE_BF16) {
        return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
    } else if (_info.atype == INFINI_DTYPE_F64) {
        return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

} // namespace op::adaptive_max_pool1d::cpu
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
// Declares op::adaptive_max_pool1d::cpu::Descriptor via the shared macro.
DESCRIPTOR(cpu)
#endif
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
// Device-side worker shared by the CUDA-like backends (metax/moore).
// Each CUDA block handles one (batch, channel) pair; threads stride over the
// output positions in steps of BLOCK_SIZE. Tcompute is the accumulation type
// (e.g. float for half inputs).
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_channel,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x_batch,
    ptrdiff_t stride_x_channel,
    ptrdiff_t stride_x_length,
    size_t channels,
    size_t input_length,
    size_t output_length,
    size_t ndim) {
    size_t block_idx = blockIdx.x;
    size_t batch_idx = block_idx / channels;
    size_t channel_idx = block_idx % channels;
    const Tdata *x_ptr;
    Tdata *y_ptr;
    if (ndim > 2) {
        // (N, C, L) layout: offset by both batch and channel strides.
        x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
        y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
    } else {
        // (N, L) layout: channel stride unused.
        x_ptr = x + batch_idx * stride_x_batch;
        y_ptr = y + batch_idx * stride_y_batch;
    }
    for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
        // Adaptive window [floor(k*L/O), ceil((k+1)*L/O)).
        // NOTE(review): computed in single-precision float, unlike the CPU
        // path's integer arithmetic — may round differently for very large
        // lengths; confirm acceptable.
        int start_index = static_cast<int>(floorf((float)out_idx * input_length / output_length));
        int end_index = static_cast<int>(ceilf((float)(out_idx + 1) * input_length / output_length));
        if (end_index <= start_index) {
            continue;
        }
        Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
        for (int i = start_index + 1; i < end_index; ++i) {
            Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
            max_val = max(max_val, val);
        }
        y_ptr[out_idx] = Tdata(max_val);
    }
}
#endif
src/infiniop/ops/adaptive_max_pool1d/info.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPATIVE_MAX_POOL1D_H__
#define __ADAPATIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {

// Validated, backend-agnostic description of one adaptive_max_pool1d call:
// dtype, output shape, both stride sets, and input/output lengths along the
// pooled (last) dimension. Construct via create().
class AdaptiveMaxPool1dInfo {
    AdaptiveMaxPool1dInfo() = default; // only create() may build an instance

public:
    infiniDtype_t atype;                // element dtype shared by x and y
    std::vector<size_t> shape;          // output (y) shape
    std::vector<ptrdiff_t> y_strides;   // output strides, in elements
    std::vector<ptrdiff_t> x_strides;   // input strides, in elements
    size_t input_size;                  // L_in: last dim of x
    size_t output_size;                 // L_out: last dim of y

    size_t ndim() const {
        return shape.size();
    }
    size_t input_length() const {
        return input_size;
    }
    size_t output_length() const {
        return output_size;
    }

    // Validates the descriptor pair and packs the info.
    // Rules enforced: same dtype (one of F16/BF16/F32/F64), same rank,
    // identical leading dims, and y's last dim equal to output_size.
    static utils::Result<AdaptiveMaxPool1dInfo> create(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        size_t output_size) {
        auto atype = y_desc->dtype();
        if (x_desc->dtype() != atype) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16
            && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        const size_t y_ndim = y_desc->ndim();
        const size_t x_ndim = x_desc->ndim();
        if (y_ndim != x_ndim) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        // All dims except the pooled last one must match between x and y.
        for (size_t i = 0; i < y_ndim - 1; ++i) {
            if (x_desc->dim(i) != y_desc->dim(i)) {
                return INFINI_STATUS_BAD_TENSOR_SHAPE;
            }
        }
        if (y_desc->dim(y_ndim - 1) != output_size) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
            atype,
            y_desc->shape(),
            y_desc->strides(),
            x_desc->strides(),
            x_desc->dim(x_ndim - 1),
            output_size});
    }
};

} // namespace op::adaptive_max_pool1d
#endif // __ADAPATIVE_MAX_POOL1D_H__
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
// Declares op::adaptive_max_pool1d::metax::Descriptor via the shared macro.
DESCRIPTOR(metax)
#endif
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
deleted
100644 → 0
View file @
037140c0
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
// Metax kernel entry point: thin wrapper launching the shared CUDA-style
// per-block worker from ../cuda/kernel.cuh.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_channel,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x_batch,
    ptrdiff_t stride_x_channel,
    ptrdiff_t stride_x_length,
    size_t channels,
    size_t input_length,
    size_t output_length,
    size_t ndim) {
    adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
        y, stride_y_batch, stride_y_channel,
        x, stride_x_batch, stride_x_channel, stride_x_length,
        channels, input_length, output_length,ndim);
}
namespace op::adaptive_max_pool1d::metax {

// Backend-private state: shared handle internals for device queries.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor(){
    delete _opaque;
}

// Validates shapes/dtypes and builds a metax descriptor holding the device
// handle internals. No extra workspace is required.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(result);
    auto info = result.take();
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Launches the typed kernel for the requested dtype; half/bf16 accumulate in
// float, double in double.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t numblock,
    void *y, infiniDtype_t dtype,
    ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
    const void *x,
    ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
    size_t channels, size_t input_length, size_t output_length, size_t ndim,
    hcStream_t stream){
#define LAUNCH_KERNEL(Tdata, Tcompute) \
    adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<numblock, BLOCK_SIZE, 0, stream>>> ( \
        reinterpret_cast<Tdata *>(y), \
        stride_y_batch, stride_y_channel, \
        reinterpret_cast<const Tdata *>(x), \
        stride_x_batch, stride_x_channel, stride_x_length, \
        channels, input_length, output_length, ndim)
    if (dtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, float);
    } else if (dtype == INFINI_DTYPE_BF16) {
        LAUNCH_KERNEL(__hpcc_bfloat16, float);
    } else if (dtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float);
    } else if (dtype == INFINI_DTYPE_F64) {
        LAUNCH_KERNEL(double, double);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}

// Grid: one block per (batch, channel) pair; block size fixed at 1024 threads.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream_) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t ndim = _info.ndim();
    const size_t batch_size = _info.shape[0];
    const size_t channels = ndim > 2 ? _info.shape[1] : 1;
    const size_t input_length = _info.input_length();
    const size_t output_length = _info.output_length();
    ptrdiff_t stride_x_batch = _info.x_strides[0];
    ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
    ptrdiff_t stride_x_length = _info.x_strides.back();
    ptrdiff_t stride_y_batch = _info.y_strides[0];
    ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
    uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
    auto stream = reinterpret_cast<hcStream_t>(stream_);
    if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
            num_blocks, y, _info.atype,
            stride_y_batch, stride_y_channel,
            x, stride_x_batch, stride_x_channel, stride_x_length,
            channels, input_length, output_length, ndim,
            stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::adaptive_max_pool1d::metax
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__

#include "../adaptive_max_pool1d.h"

// Declares the Moore-backend Descriptor (namespace
// op::adaptive_max_pool1d::moore) via the shared DESCRIPTOR macro.
// Fix: the include guard was misspelled "MOOORE" (triple O); harmless as a
// guard but wrong per the project's file-based naming convention.
DESCRIPTOR(moore)

#endif // __ADAPTIVE_MAX_POOL1D_MOORE_H__
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
deleted
100644 → 0
View file @
037140c0
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
// Moore device-kernel entry point for adaptive max-pool 1D.
// It forwards every argument unchanged to the shared block-level
// implementation (adaptiveMaxPool1dBlock) included from ../cuda/kernel.cuh.
// NOTE(review): the launch code in this file uses one block per
// (batch, channel) pair; presumably adaptiveMaxPool1dBlock relies on that
// mapping -- confirm against kernel.cuh.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim){
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
// Backend-private descriptor state: holds the Moore device-handle internals
// so they outlive the handle; used at launch time to query device limits
// (maxThreadsPerBlock).
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private Opaque allocated in Descriptor::create.
Descriptor::~Descriptor() {
delete _opaque;
}
// Builds a Moore-backend descriptor for adaptive max-pool 1D.
// Validates the (y, x, output_size) combination via AdaptiveMaxPool1dInfo
// and captures the device-handle internals; this operator needs no extra
// workspace, so the workspace size is recorded as 0.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto info_result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(info_result);
    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(),
        /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Type-dispatch wrapper for the Moore backend: instantiates and launches
// adaptiveMaxPool1dKernel for the element type named by `dtype`, using
// `num_blocks` blocks of BLOCK_SIZE threads on `musa_stream`.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for any dtype other than
// F16 / BF16 / F32 / F64.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t num_blocks,
    void *y, infiniDtype_t dtype,
    ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
    const void *x,
    ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
    size_t channels, size_t input_length, size_t output_length, size_t ndim,
    musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute)                                                                  \
    adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(   \
        reinterpret_cast<Tdata *>(y),                                                                   \
        stride_y_batch, stride_y_channel,                                                               \
        reinterpret_cast<const Tdata *>(x),                                                             \
        stride_x_batch, stride_x_channel, stride_x_length,                                              \
        channels, input_length, output_length, ndim)
    switch (dtype) {
    case INFINI_DTYPE_F16:
        LAUNCH_KERNEL(half, float);
        break;
    case INFINI_DTYPE_BF16:
        LAUNCH_KERNEL(__mt_bfloat16, float);
        break;
    case INFINI_DTYPE_F32:
        LAUNCH_KERNEL(float, float);
        break;
    case INFINI_DTYPE_F64:
        LAUNCH_KERNEL(double, double);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}
// Executes the adaptive max-pool 1D forward pass on the Moore device.
// One device block is launched per (batch, channel) pair; the block size is
// the largest the device supports among {2048, 1024, 512}.
//
// Fix: the original dispatch tested `>= MOORE_BLOCK_SIZE_1024`, then
// `>= MOORE_BLOCK_SIZE_512`, then `== MOORE_BLOCK_SIZE_2048` -- making the
// 2048 branch unreachable, since a device with 2048 threads/block also
// satisfies `>= 1024`. Branches are now ordered largest-first so each tier
// is actually selectable.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t ndim = _info.ndim();
    const size_t batch_size = _info.shape[0];
    // A rank <= 2 tensor has no explicit channel axis; treat it as 1 channel.
    const size_t channels = ndim > 2 ? _info.shape[1] : 1;
    const size_t input_length = _info.input_length();
    const size_t output_length = _info.output_length();
    const ptrdiff_t stride_x_batch = _info.x_strides[0];
    const ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
    const ptrdiff_t stride_x_length = _info.x_strides.back();
    const ptrdiff_t stride_y_batch = _info.y_strides[0];
    const ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
    const uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    const auto max_threads = _opaque->internal->maxThreadsPerBlock();
// All three tiers pass identical arguments; only BLOCK_SIZE differs.
#define LAUNCH_WITH(BS)                                           \
    CHECK_STATUS(launchKernel<BS>(                                \
        num_blocks, y, _info.atype,                               \
        stride_y_batch, stride_y_channel,                         \
        x, stride_x_batch, stride_x_channel, stride_x_length,     \
        channels, input_length, output_length, ndim,              \
        musa_stream))
    if (max_threads >= MOORE_BLOCK_SIZE_2048) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_2048);
    } else if (max_threads >= MOORE_BLOCK_SIZE_1024) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_1024);
    } else if (max_threads >= MOORE_BLOCK_SIZE_512) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_512);
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
#undef LAUNCH_WITH
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment