Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
9ef02a16
Commit
9ef02a16
authored
Sep 15, 2025
by
Ziminli
Browse files
issue/450: remove indexToReducedOffset() in all platforms
parent
5e581b8e
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
4 additions
and
114 deletions
+4
-114
src/infiniop/binary/cpu/binary_cpu.h
src/infiniop/binary/cpu/binary_cpu.h
+4
-4
src/infiniop/devices/bang/bang_kernel_common.h
src/infiniop/devices/bang/bang_kernel_common.h
+0
-29
src/infiniop/devices/cpu/common_cpu.cc
src/infiniop/devices/cpu/common_cpu.cc
+0
-13
src/infiniop/devices/cpu/common_cpu.h
src/infiniop/devices/cpu/common_cpu.h
+0
-3
src/infiniop/devices/kunlun/kunlun_kernel_common.h
src/infiniop/devices/kunlun/kunlun_kernel_common.h
+0
-21
src/infiniop/devices/metax/metax_kernel_common.h
src/infiniop/devices/metax/metax_kernel_common.h
+0
-15
src/infiniop/devices/moore/moore_kernel_common.h
src/infiniop/devices/moore/moore_kernel_common.h
+0
-15
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
+0
-14
No files found.
src/infiniop/binary/cpu/binary_cpu.h
View file @
9ef02a16
...
...
@@ -19,8 +19,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
data_size
;
++
i
)
{
size_t
a_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
a_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
())
)
;
size_t
b_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
b_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
())
)
;
size_t
a_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
());
size_t
b_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
());
size_t
c_index
=
info
.
contiguous
?
i
:
(
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
c_shape
.
data
(),
info
.
c_strides
.
data
()));
c_
[
c_index
]
=
BinaryOp
{}(
a_
[
a_index
],
b_
[
b_index
],
std
::
forward
<
Args
>
(
args
)...);
...
...
@@ -37,8 +37,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
data_size
;
++
i
)
{
size_t
a_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
a_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
())
)
;
size_t
b_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
b_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
())
)
;
size_t
a_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
());
size_t
b_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
());
size_t
c_index
=
info
.
contiguous
?
i
:
(
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
c_shape
.
data
(),
info
.
c_strides
.
data
()));
if
constexpr
(
std
::
is_same_v
<
Tdata
,
fp16_t
>
)
{
...
...
src/infiniop/devices/bang/bang_kernel_common.h
View file @
9ef02a16
...
...
@@ -22,35 +22,6 @@ __mlu_device__ half to_half(const T &v) {
return
static_cast
<
half
>
(
v
);
}
/**
 * @brief Maps a flat index of a broadcast (output) tensor to the memory
 *        offset of the corresponding element in the original input tensor.
 *
 * Used when an input was broadcast up to the output shape: the flat index
 * is decomposed dimension by dimension with the output strides, and each
 * coordinate is re-weighted by the input's own strides.
 *
 * @param flat_index Flattened element index in the broadcast (output) tensor
 * @param ndim Number of dimensions
 * @param broadcasted_strides Strides of the broadcast (output) tensor
 * @param target_strides Strides of the original (non-broadcast) tensor
 * @return size_t Memory offset into the original tensor
 */
inline __mlu_device__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension in the broadcast layout.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        // Re-weight by the original tensor's stride (0 for broadcast dims).
        offset += coord * target_strides[dim];
    }
    return offset;
}
/**
* @brief Converts a flattened index to a memory offset considering tensor striding.
*
...
...
src/infiniop/devices/cpu/common_cpu.cc
View file @
9ef02a16
...
...
@@ -2,19 +2,6 @@
namespace
op
::
common_cpu
{
// Map the flat index of an element in a broadcast (output) tensor to the
// memory offset of the matching element in the original tensor: each
// coordinate recovered via the output strides is re-weighted by the input
// strides (which are 0 along broadcast dimensions).
size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
...
...
src/infiniop/devices/cpu/common_cpu.h
View file @
9ef02a16
...
...
@@ -15,9 +15,6 @@
namespace
op
::
common_cpu
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
);
// return the memory offset a tensor given flattened index
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
);
...
...
src/infiniop/devices/kunlun/kunlun_kernel_common.h
View file @
9ef02a16
...
...
@@ -105,27 +105,6 @@ inline __device__ T atomicMax(__shared_ptr__ T *ptr, T value) {
return
old
;
}
/**
 * @brief Get index of broadcasted input
 * flat_index: flatten index of output tensor
 * ndim: dim of output tensor
 * broadcasted_strides: strides of output tensor
 * target_strides: strides of input tensor
 */
inline __device__ int indexToReducedOffset(
    int flat_index,           // output flatten index
    int ndim,                 // output dims
    const _ptrdiff_t *broadcasted_strides, // output strides
    const _ptrdiff_t *target_strides) {    // strides of inputs
    int offset = 0;
    int remaining = flat_index;
    for (int dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, taken from the output layout.
        const int coord = remaining / broadcasted_strides[dim].value;
        remaining %= broadcasted_strides[dim].value;
        // Re-weight by the input stride (0 along broadcast dimensions).
        offset += coord * target_strides[dim].value;
    }
    return offset;
}
/**
* @brief Get real offset of input index
* flat_index: flatten index input
...
...
src/infiniop/devices/metax/metax_kernel_common.h
View file @
9ef02a16
...
...
@@ -12,21 +12,6 @@ using cuda_bfloat162 = hpcc_bfloat162;
namespace
device
::
metax
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
...
...
src/infiniop/devices/moore/moore_kernel_common.h
View file @
9ef02a16
...
...
@@ -16,21 +16,6 @@ using cuda_bfloat162 = mt_bfloat162;
namespace
device
::
moore
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
...
...
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
View file @
9ef02a16
...
...
@@ -19,20 +19,6 @@ using cuda_bfloat16 = nv_bfloat16;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
nvidia
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment