Unverified Commit f385af7a authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #54 from PanZezhong1725/issue/53

issue/53: 形状统一为size_t,步长统一为ptrdiff_t (unify tensor shapes as size_t and strides as ptrdiff_t)
parents 3c31dc6c 64849b43
......@@ -15,7 +15,7 @@ __C __export infiniopStatus_t infiniopCreateAttentionDescriptor(infiniopHandle_t
infiniopTensorDescriptor_t v_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
uint64_t pos);
size_t pos);
__C __export infiniopStatus_t infiniopGetAttentionWorkspaceSize(infiniopAttentionDescriptor_t desc, size_t *size);
......
......@@ -9,10 +9,10 @@ __C __export infiniopStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t h
infiniopAvgPoolDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
uint64_t const *kernel_shape,
uint64_t const *pads,
int64_t const *strides,
uint64_t n);
size_t const *kernel_shape,
size_t const *pads,
ptrdiff_t const *strides,
size_t n);
__C __export infiniopStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, size_t *size);
......
......@@ -9,10 +9,10 @@ __C __export infiniopStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t h
infiniopMaxPoolDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
uint64_t const *kernel_shape,
uint64_t const *pads,
int64_t const *strides,
uint64_t n);
size_t const *kernel_shape,
size_t const *pads,
ptrdiff_t const *strides,
size_t n);
__C __export infiniopStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, size_t *size);
......
......@@ -12,12 +12,12 @@ struct InfiniopTensorDescriptor {
// Shape of the tensor, ndim elements
size_t *shape;
// Stride of each dimension in elements, ndim elements
int64_t *strides;
ptrdiff_t *strides;
};
typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, int64_t const *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, ptrdiff_t const *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
......
......@@ -60,8 +60,8 @@ uint16_t f32_to_f16(float val) {
}
size_t indexToReducedOffset(size_t flat_index, size_t ndim,
int64_t const *broadcasted_strides,
int64_t const *target_strides) {
ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -71,7 +71,7 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim,
}
// Map a flat (row-major) element index to a memory offset using per-dim
// strides: walks dimensions from innermost to outermost, peeling off the
// coordinate with `flat_index % shape[i]` and weighting it by strides[i].
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
int64_t const *strides) {
ptrdiff_t const *strides) {
size_t res = 0;
// BUG(review): `i-- >= 0` is always true for unsigned size_t. After the
// i == 0 iteration, `i--` wraps i to SIZE_MAX and the body reads
// shape[SIZE_MAX]/strides[SIZE_MAX] out of bounds. Condition should be
// `i-- > 0` — the CUDA copy of this helper uses exactly that. Compilers
// warn: "comparison of unsigned expression >= 0 is always true".
for (size_t i = ndim; i-- >= 0;) {
res += (flat_index % shape[i]) * strides[i];
......@@ -81,7 +81,7 @@ size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
}
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
uint64_t total_size = 1;
size_t total_size = 1;
for (size_t i = 0; i < ndim; ++i) {
total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]);
}
......
......@@ -13,10 +13,10 @@ float f16_to_f32(uint16_t code);
uint16_t f32_to_f16(float val);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broadcasted_strides, int64_t const *target_strides);
size_t indexToReducedOffset(size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides, ptrdiff_t const *target_strides);
// return the memory offset a tensor given flattened index
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides);
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, ptrdiff_t const *strides);
/**
* get the total array size (element count) after applying padding for a
......
......@@ -96,8 +96,8 @@ inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
// return the memory offset of original tensor, given the flattened index of
// broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
size_t flat_index, size_t ndim, int64_t const *broadcasted_strides,
int64_t const *target_strides) {
size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -109,7 +109,7 @@ inline __device__ __host__ size_t indexToReducedOffset(
// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
size_t const *shape,
int64_t const *strides) {
ptrdiff_t const *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
......
......@@ -40,7 +40,7 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) {
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
......@@ -78,7 +78,7 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) {
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, void *data, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
......
......@@ -9,11 +9,11 @@
typedef struct BlasMatrix {
size_t ndim;
size_t batch;
int64_t stride;
ptrdiff_t stride;
size_t rows;
size_t cols;
int64_t row_stride;
int64_t col_stride;
ptrdiff_t row_stride;
ptrdiff_t col_stride;
BlasMatrix() {}
......@@ -56,7 +56,7 @@ typedef struct BlasMatrix {
std::swap(row_stride, col_stride);
}
int64_t ld() const {
ptrdiff_t ld() const {
if (this->row_stride == 1) {
return this->col_stride;
} else {
......
......@@ -25,7 +25,7 @@ infiniopStatus_t cpuCreateMatmulDescriptor(
}
// Query the scratch-workspace size (in bytes) required by cpuMatmul.
//
// The CPU matmul path needs no auxiliary workspace, so *size is always
// set to 0. `desc` is accepted for API symmetry with other backends and
// is not read here.
//
// Returns INFINIOP_STATUS_SUCCESS unconditionally.
// NOTE(review): the stale pre-change `uint64_t *size` diff line is dropped;
// the size_t signature matches this commit's size_t/ptrdiff_t unification.
infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
                                           size_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}
......@@ -76,7 +76,7 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
}
infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
uint64_t workspace_size, void *c, void const *a,
size_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta) {
if (desc->dtype == INFINI_DTYPE_F16) {
return cpuCalculateMatmul<uint16_t>(desc, c, beta, a, b, alpha);
......
......@@ -14,10 +14,10 @@ infiniopStatus_t cpuCreateMatmulDescriptor(
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
uint64_t *size);
size_t *size);
infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
uint64_t workspace_size, void *c, void const *a,
size_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta);
infiniopStatus_t cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc);
......
......@@ -27,7 +27,7 @@ infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
return INFINIOP_STATUS_SUCCESS;
}
// Query the scratch-workspace size (in bytes) required by cudaMatmul.
//
// The cuBLAS-backed matmul path allocates nothing extra through this API,
// so *size is always set to 0. `desc` is accepted for API symmetry and is
// not read here.
//
// Returns INFINIOP_STATUS_SUCCESS unconditionally.
// NOTE(review): the stale pre-change `uint64_t *size` diff line is dropped;
// the size_t signature matches this commit's size_t/ptrdiff_t unification.
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, size_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}
......
......@@ -13,11 +13,11 @@ infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size);
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, size_t *size);
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
size_t workspace_size,
void *c,
void const *a,
void const *b,
......
......@@ -56,7 +56,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
size_t workspace_size,
void *c,
void const *a,
void const *b,
......
......@@ -38,7 +38,7 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
};
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
......@@ -77,7 +77,7 @@ __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDe
__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
size_t workspace_size,
void *result,
void const *probs,
float random_val,
......
......@@ -46,7 +46,7 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
......@@ -83,7 +83,7 @@ __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size,
__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, size_t workspace_size,
void *y, void const *x, void const *w, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
......
......@@ -53,7 +53,7 @@ __C infiniopStatus_t infiniopCreateRoPEDescriptor(
}
__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
uint64_t *size) {
size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
......@@ -90,7 +90,7 @@ __C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
}
__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc,
void *workspace, uint64_t workspace_size,
void *workspace, size_t workspace_size,
void *t, void const *pos_ids,
void const *sin_table, void const *cos_table,
void *stream) {
......
......@@ -37,9 +37,9 @@
} \
} while (0)
inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) {
std::vector<int64_t> strides(desc->ndim);
for (uint64_t i = 0; i < desc->ndim; i++) {
inline std::vector<ptrdiff_t> getByteStrides(infiniopTensorDescriptor_t desc) {
std::vector<ptrdiff_t> strides(desc->ndim);
for (size_t i = 0; i < desc->ndim; i++) {
strides[i] = desc->strides[i] * infiniSizeof(desc->dtype);
}
return strides;
......@@ -54,11 +54,11 @@ inline size_t getByteSize(infiniopTensorDescriptor_t desc) {
}
// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
const uint64_t *shape2, uint64_t ndim2,
uint64_t *broadcast_shape,
uint64_t *padded_shape1, uint64_t *padded_shape2,
uint64_t max_rank) {
inline bool getBroadcastShape(const size_t *shape1, size_t ndim1,
const size_t *shape2, size_t ndim2,
size_t *broadcast_shape,
size_t *padded_shape1, size_t *padded_shape2,
size_t max_rank) {
// prepending and initializing
std::fill(padded_shape1, padded_shape1 + max_rank, 1);
std::fill(padded_shape2, padded_shape2 + max_rank, 1);
......@@ -82,8 +82,8 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b,
infiniopTensorDescriptor_t c,
uint64_t broadcast_ndim) {
std::vector<uint64_t> broadcast_shape_(broadcast_ndim),
size_t broadcast_ndim) {
std::vector<size_t> broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim), padded_shape2_(broadcast_ndim);
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
......@@ -130,7 +130,7 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
return nullptr;
}
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
ptrdiff_t *strides = new ptrdiff_t[ndim];
for (size_t i = 0; i < ndim; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
return nullptr;
......@@ -146,7 +146,7 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
inline bool isContiguous(const infiniopTensorDescriptor_t &desc,
size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
if (desc->strides[i - 1] != static_cast<ptrdiff_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
......@@ -170,7 +170,7 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
size_t new_ndim = ndim - (dim_end - dim_start);
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
ptrdiff_t *new_strides = new ptrdiff_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim_start; i++) {
new_shape[index] = desc->shape[i];
......@@ -205,7 +205,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
}
size_t new_ndim = ndim + dims.size() - 1;
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
ptrdiff_t *new_strides = new ptrdiff_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim; i++) {
new_shape[index] = desc->shape[i];
......
#include "infiniop/tensor_descriptor.h"
#include <cstring>
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape_, int64_t const *strides_, infiniDtype_t datatype) {
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape_, ptrdiff_t const *strides_, infiniDtype_t datatype) {
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
ptrdiff_t *strides = new ptrdiff_t[ndim];
std::memcpy(shape, shape_, ndim * sizeof(size_t));
if (strides_) {
std::memcpy(strides, strides_, ndim * sizeof(int64_t));
std::memcpy(strides, strides_, ndim * sizeof(ptrdiff_t));
} else {
int64_t dsize = 1;
for (int i = ndim - 1; i >= 0; i--) {
ptrdiff_t dsize = 1;
for (size_t i = ndim - 1; i >= 0; i--) {
strides[i] = dsize;
dsize *= shape[i];
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment