Commit 9ef02a16 authored by Ziminli

issue/450: remove indexToReducedOffset() in all platforms

parent 5e581b8e
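
This commit deletes the broadcast-specific helper `indexToReducedOffset()` from every backend (CPU, MLU, KUNLUN, MetaX, Moore, NVIDIA) and routes all non-contiguous index computation through `indexToOffset()`. The diff itself does not show why this is safe; presumably `BinaryInfo` (and its per-platform analogues) now stores each input's shape and strides already expanded to the output shape, with stride 0 on broadcast dimensions, in which case the two helpers compute the same offset. Below is a minimal sketch of that equivalence, assuming a conventional row-major `indexToOffset` (the repository's actual implementation is not part of this diff):

```cpp
// Sketch only, not repository code: shows that indexToReducedOffset()
// is redundant once input strides are expanded to the output shape
// with stride 0 on broadcast dimensions.
#include <cassert>
#include <cstddef>

// Old helper (removed by this commit): decompose flat_index with the
// output's contiguous strides, then apply the input's strides.
size_t indexToReducedOffset(size_t flat_index, size_t ndim,
                            const ptrdiff_t *broadcasted_strides,
                            const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

// Kept helper, written here in the common row-major form (an assumption):
// decompose flat_index with a shape, then apply strides.
size_t indexToOffset(size_t flat_index, size_t ndim,
                     const size_t *shape, const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

int main() {
    // Output shape {2, 3}; input broadcast along dim 0, so its strides
    // are expanded to {0, 1} against the output shape.
    size_t out_shape[] = {2, 3};
    ptrdiff_t out_strides[] = {3, 1}; // contiguous strides of the output
    ptrdiff_t a_strides[] = {0, 1};   // 0 stride marks the broadcast dim
    for (size_t i = 0; i < 6; ++i) {
        assert(indexToReducedOffset(i, 2, out_strides, a_strides)
               == indexToOffset(i, 2, out_shape, a_strides));
    }
    return 0;
}
```

Under that assumption, the `info.broadcasted` branch is dead code, which is exactly what the CPU hunks below delete.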
@@ -19,8 +19,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
     #pragma omp parallel for
     for (ptrdiff_t i = 0; i < data_size; ++i) {
-        size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
-        size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
+        size_t a_index = info.contiguous ? i : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data());
+        size_t b_index = info.contiguous ? i : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data());
         size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
         c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
@@ -37,8 +37,8 @@ void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *
     #pragma omp parallel for
     for (ptrdiff_t i = 0; i < data_size; ++i) {
-        size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
-        size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
+        size_t a_index = info.contiguous ? i : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data());
+        size_t b_index = info.contiguous ? i : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data());
         size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
         if constexpr (std::is_same_v<Tdata, fp16_t>) {
......
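
Both CPU hunks rely on the same precondition: for a broadcast input, `info.a_shape`/`info.a_strides` (and likewise for `b`) must already describe the input as expanded to the output shape, otherwise `indexToOffset()` alone cannot reproduce the removed `indexToReducedOffset()` path. The diff does not show where that expansion happens; a hypothetical sketch of what it would look like (names are illustrative, not from the repository):

```cpp
// Hypothetical helper, not repository code: expand an input's
// (shape, strides) to the output shape, numpy-style, so that
// indexToOffset() over the output-shaped index space is correct.
#include <cstddef>
#include <vector>

struct Expanded {
    std::vector<size_t> shape;
    std::vector<ptrdiff_t> strides;
};

Expanded expandToOutput(const std::vector<size_t> &shape,
                        const std::vector<ptrdiff_t> &strides,
                        const std::vector<size_t> &out_shape) {
    size_t ndim = out_shape.size();
    size_t off = ndim - shape.size(); // right-align the input's dims
    Expanded e{out_shape, std::vector<ptrdiff_t>(ndim, 0)};
    for (size_t i = 0; i < shape.size(); ++i) {
        // Size-1 (broadcast) dims keep stride 0; others keep their stride.
        if (shape[i] != 1) {
            e.strides[off + i] = strides[i];
        }
    }
    return e;
}
```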
@@ -22,35 +22,6 @@ __mlu_device__ half to_half(const T &v) {
     return static_cast<half>(v);
 }
-
-/**
- * @brief Converts a flattened index to a reduced offset considering broadcasting.
- *
- * This function is used when dealing with broadcasted tensors where the input
- * has been broadcast to match the output shape. It calculates the offset in
- * the original (non-broadcasted) tensor.
- *
- * @param flat_index The flattened index in the output tensor
- * @param ndim Number of dimensions
- * @param broadcasted_strides Strides of the broadcasted tensor
- * @param target_strides Strides of the original (non-broadcasted) tensor
- * @return size_t Offset in the original tensor's memory
- */
-inline __mlu_device__ size_t indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        // Calculate contribution from each dimension
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        // Remove the contribution from this dimension
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
-
 /**
  * @brief Converts a flattened index to a memory offset considering tensor striding.
  *
......
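
For reviewers checking the deleted MLU helper against its replacement, a worked trace (values chosen for illustration only): take output shape (2, 3) with contiguous strides (3, 1) and an input broadcast along dim 0, i.e. target strides (0, 1). For `flat_index = 4`, the loop computes 4 / 3 = 1 for the row and 4 % 3 = 1 for the column, so `res` = 1·0 + 1·1 = 1. Applying `indexToOffset()` with the output shape and the zero-expanded strides yields the same offset, which is why the helper can be dropped here as well.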
@@ -2,19 +2,6 @@
 namespace op::common_cpu {
-
-size_t indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
 size_t indexToOffset(
     size_t flat_index,
     size_t ndim,
......
@@ -15,9 +15,6 @@
 namespace op::common_cpu {
-
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
 // return the memory offset of a tensor given its flattened index
 size_t indexToOffset(size_t flat_index, size_t ndim, const size_t *shape, const ptrdiff_t *strides);
......
@@ -105,27 +105,6 @@ inline __device__ T atomicMax(__shared_ptr__ T *ptr, T value) {
     return old;
 }
-
-/**
- * @brief Get index of broadcasted input
- * flat_index: flatten index of output tensor
- * ndim: dim of output tensor
- * broadcasted_strides: strides of output tensor
- * target_strides: strides of input tensor
- */
-inline __device__ int indexToReducedOffset(
-    int flat_index,                        // output flatten index
-    int ndim,                              // output dims
-    const _ptrdiff_t *broadcasted_strides, // output strides
-    const _ptrdiff_t *target_strides) {    // strides of inputs
-    int res = 0;
-    for (int i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i].value * target_strides[i].value;
-        flat_index %= broadcasted_strides[i].value;
-    }
-    return res;
-}
-
 /**
  * @brief Get real offset of input index
  * flat_index: flatten index input
......
@@ -12,21 +12,6 @@ using cuda_bfloat162 = hpcc_bfloat162;
 namespace device::metax {
-
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-__forceinline__ __device__ __host__ size_t
-indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
 // get the memory offset of the given element in a tensor given its flat index
 __forceinline__ __device__ __host__ size_t
 indexToOffset(
......
@@ -16,21 +16,6 @@ using cuda_bfloat162 = mt_bfloat162;
 namespace device::moore {
-
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-__forceinline__ __device__ __host__ size_t
-indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
 // get the memory offset of the given element in a tensor given its flat index
 __forceinline__ __device__ __host__ size_t
 indexToOffset(
......
@@ -19,20 +19,6 @@ using cuda_bfloat16 = nv_bfloat16;
 using cuda_bfloat162 = nv_bfloat162;
 namespace device::nvidia {
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-__forceinline__ __device__ __host__ size_t
-indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
 // get the memory offset of the given element in a tensor given its flat index
 __forceinline__ __device__ __host__ size_t
......