Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
9ef02a16
Commit
9ef02a16
authored
Sep 15, 2025
by
Ziminli
Browse files
issue/450: remove indexToReducedOffset() in all platforms
parent
5e581b8e
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
4 additions
and
114 deletions
+4
-114
src/infiniop/binary/cpu/binary_cpu.h
src/infiniop/binary/cpu/binary_cpu.h
+4
-4
src/infiniop/devices/bang/bang_kernel_common.h
src/infiniop/devices/bang/bang_kernel_common.h
+0
-29
src/infiniop/devices/cpu/common_cpu.cc
src/infiniop/devices/cpu/common_cpu.cc
+0
-13
src/infiniop/devices/cpu/common_cpu.h
src/infiniop/devices/cpu/common_cpu.h
+0
-3
src/infiniop/devices/kunlun/kunlun_kernel_common.h
src/infiniop/devices/kunlun/kunlun_kernel_common.h
+0
-21
src/infiniop/devices/metax/metax_kernel_common.h
src/infiniop/devices/metax/metax_kernel_common.h
+0
-15
src/infiniop/devices/moore/moore_kernel_common.h
src/infiniop/devices/moore/moore_kernel_common.h
+0
-15
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
+0
-14
No files found.
src/infiniop/binary/cpu/binary_cpu.h
View file @
9ef02a16
...
...
@@ -19,8 +19,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
data_size
;
++
i
)
{
size_t
a_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
a_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
())
)
;
size_t
b_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
b_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
())
)
;
size_t
a_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
());
size_t
b_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
());
size_t
c_index
=
info
.
contiguous
?
i
:
(
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
c_shape
.
data
(),
info
.
c_strides
.
data
()));
c_
[
c_index
]
=
BinaryOp
{}(
a_
[
a_index
],
b_
[
b_index
],
std
::
forward
<
Args
>
(
args
)...);
...
...
@@ -37,8 +37,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
#pragma omp parallel for
for
(
ptrdiff_t
i
=
0
;
i
<
data_size
;
++
i
)
{
size_t
a_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
a_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
())
)
;
size_t
b_index
=
info
.
contiguous
?
i
:
(
info
.
broadcasted
?
op
::
common_cpu
::
indexToReducedOffset
(
i
,
info
.
ndim
,
info
.
c_strides
.
data
(),
info
.
b_strides
.
data
())
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
())
)
;
size_t
a_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
a_shape
.
data
(),
info
.
a_strides
.
data
());
size_t
b_index
=
info
.
contiguous
?
i
:
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
b_shape
.
data
(),
info
.
b_strides
.
data
());
size_t
c_index
=
info
.
contiguous
?
i
:
(
op
::
common_cpu
::
indexToOffset
(
i
,
info
.
ndim
,
info
.
c_shape
.
data
(),
info
.
c_strides
.
data
()));
if
constexpr
(
std
::
is_same_v
<
Tdata
,
fp16_t
>
)
{
...
...
src/infiniop/devices/bang/bang_kernel_common.h
View file @
9ef02a16
...
...
@@ -22,35 +22,6 @@ __mlu_device__ half to_half(const T &v) {
return
static_cast
<
half
>
(
v
);
}
/**
 * @brief Maps a flat index of a broadcast (output) tensor to the memory
 *        offset of the corresponding element in the original input tensor.
 *
 * Used when an input was broadcast up to the output shape: the flat index
 * is decomposed dimension by dimension with the output strides, and each
 * coordinate is re-weighted by the input's own strides.
 *
 * @param flat_index Flattened element index in the broadcast (output) tensor
 * @param ndim Number of dimensions
 * @param broadcasted_strides Strides of the broadcast (output) tensor
 * @param target_strides Strides of the original (non-broadcast) tensor
 * @return size_t Memory offset into the original tensor
 */
inline __mlu_device__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension in the broadcast layout.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        // Re-weight by the original tensor's stride (0 for broadcast dims).
        offset += coord * target_strides[dim];
    }
    return offset;
}
/**
* @brief Converts a flattened index to a memory offset considering tensor striding.
*
...
...
src/infiniop/devices/cpu/common_cpu.cc
View file @
9ef02a16
...
...
@@ -2,19 +2,6 @@
namespace
op
::
common_cpu
{
// Map the flat index of an element in a broadcast (output) tensor to the
// memory offset of the matching element in the original tensor: each
// coordinate recovered via the output strides is re-weighted by the input
// strides (which are 0 along broadcast dimensions).
size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
...
...
src/infiniop/devices/cpu/common_cpu.h
View file @
9ef02a16
...
...
@@ -15,9 +15,6 @@
namespace
op
::
common_cpu
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t
indexToReducedOffset
(
size_t
flat_index
,
size_t
ndim
,
const
ptrdiff_t
*
broadcasted_strides
,
const
ptrdiff_t
*
target_strides
);
// return the memory offset a tensor given flattened index
size_t
indexToOffset
(
size_t
flat_index
,
size_t
ndim
,
const
size_t
*
shape
,
const
ptrdiff_t
*
strides
);
...
...
src/infiniop/devices/kunlun/kunlun_kernel_common.h
View file @
9ef02a16
...
...
@@ -105,27 +105,6 @@ inline __device__ T atomicMax(__shared_ptr__ T *ptr, T value) {
return
old
;
}
/**
 * @brief Get index of broadcasted input
 * flat_index: flatten index of output tensor
 * ndim: dim of output tensor
 * broadcasted_strides: strides of output tensor
 * target_strides: strides of input tensor
 */
inline __device__ int indexToReducedOffset(
    int flat_index,           // output flatten index
    int ndim,                 // output dims
    const _ptrdiff_t *broadcasted_strides, // output strides
    const _ptrdiff_t *target_strides) {    // strides of inputs
    int offset = 0;
    int remaining = flat_index;
    for (int dim = 0; dim < ndim; ++dim) {
        // Coordinate along this dimension, taken from the output layout.
        const int coord = remaining / broadcasted_strides[dim].value;
        remaining %= broadcasted_strides[dim].value;
        // Re-weight by the input stride (0 along broadcast dimensions).
        offset += coord * target_strides[dim].value;
    }
    return offset;
}
/**
* @brief Get real offset of input index
* flat_index: flatten index input
...
...
src/infiniop/devices/metax/metax_kernel_common.h
View file @
9ef02a16
...
...
@@ -12,21 +12,6 @@ using cuda_bfloat162 = hpcc_bfloat162;
namespace
device
::
metax
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
...
...
src/infiniop/devices/moore/moore_kernel_common.h
View file @
9ef02a16
...
...
@@ -16,21 +16,6 @@ using cuda_bfloat162 = mt_bfloat162;
namespace
device
::
moore
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
indexToOffset
(
...
...
src/infiniop/devices/nvidia/nvidia_kernel_common.cuh
View file @
9ef02a16
...
...
@@ -19,20 +19,6 @@ using cuda_bfloat16 = nv_bfloat16;
using
cuda_bfloat162
=
nv_bfloat162
;
namespace
device
::
nvidia
{
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t dim = 0; dim < ndim; ++dim) {
        // Decompose with the broadcast strides, re-weight with the originals.
        const size_t coord = remaining / broadcasted_strides[dim];
        remaining %= broadcasted_strides[dim];
        offset += coord * target_strides[dim];
    }
    return offset;
}
// get the memory offset of the given element in a tensor given its flat index
__forceinline__
__device__
__host__
size_t
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment