Commit 46da1a27 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: cpu and cuda matmul

parents
#ifndef __INFINIOP_RMS_NORM_H__
#define __INFINIOP_RMS_NORM_H__

#include "../operator.h"

// Opaque descriptor for a configured RMSNorm operator instance.
typedef InfiniopDescriptor *infiniopRMSNormDescriptor_t;

// Create an RMSNorm descriptor for output y, input x and weight w.
// `epsilon` is the small constant added for numerical stability.
__C __export infiniopStatus_t infiniopCreateRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon);

// Query the workspace size (in bytes) required by infiniopRMSNorm for this descriptor.
__C __export infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, size_t *size);

// Execute RMSNorm: reads x and w, writes y. `workspace` must be at least
// `workspace_size` bytes; `stream` is the backend execution stream
// (semantics are backend-specific — confirm against each implementation).
__C __export infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, size_t workspace_size,
                                              void *y, void const *x, void const *w, void *stream);

// Release a descriptor created by infiniopCreateRMSNormDescriptor.
__C __export infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc);

#endif
#ifndef __INFINIOP_ROTARY_EMBEDDING_H__
#define __INFINIOP_ROTARY_EMBEDDING_H__

#include "../operator.h"

// Opaque descriptor for a configured rotary position embedding (RoPE) operator.
typedef InfiniopDescriptor *infiniopRoPEDescriptor_t;

// Create a RoPE descriptor from the target tensor `t`, the position ids,
// and the precomputed sin/cos tables.
__C __export infiniopStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle,
    infiniopRoPEDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t t,
    infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table);

// Query the workspace size (in bytes) required by infiniopRoPE for this descriptor.
__C __export infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, size_t *size);

// Apply rotary embedding to `t` (the only non-const data argument, so the
// update is in place), using pos_ids and the sin/cos tables as inputs.
__C __export infiniopStatus_t infiniopRoPE(
    infiniopRoPEDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *t,
    void const *pos_ids,
    void const *sin_table,
    void const *cos_table,
    void *stream);

// Release a descriptor created by infiniopCreateRoPEDescriptor.
__C __export infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc);

#endif
#ifndef __INFINIOP_SWIGLU_H__
#define __INFINIOP_SWIGLU_H__

#include "../operator.h"

// Opaque descriptor for a configured SwiGLU operator instance.
typedef InfiniopDescriptor *infiniopSwiGLUDescriptor_t;

// Create a SwiGLU descriptor for output c and inputs a, b.
__C __export infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle,
                                                             infiniopSwiGLUDescriptor_t *desc_ptr,
                                                             infiniopTensorDescriptor_t c_desc,
                                                             infiniopTensorDescriptor_t a_desc,
                                                             infiniopTensorDescriptor_t b_desc);

// Execute SwiGLU: reads a and b, writes c. Note this operator takes no
// workspace, unlike RMSNorm/RoPE.
__C __export infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc,
                                             void *c,
                                             void const *a,
                                             void const *b,
                                             void *stream);

// Release a descriptor created by infiniopCreateSwiGLUDescriptor.
__C __export infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc);

#endif
#ifndef __INFINIOP_STATUS__
#define __INFINIOP_STATUS__

// Status codes returned by every infiniop API call.
typedef enum {
    INFINIOP_STATUS_SUCCESS = 0,                   // operation completed
    INFINIOP_STATUS_INTERNAL_ERROR = 1,            // unexpected backend failure
    INFINIOP_STATUS_BAD_PARAM = 2,                 // invalid argument
    INFINIOP_STATUS_BAD_TENSOR_DTYPE = 3,          // unsupported tensor data type
    INFINIOP_STATUS_BAD_TENSOR_SHAPE = 4,          // unsupported/mismatched shape
    INFINIOP_STATUS_BAD_TENSOR_STRIDES = 5,        // unsupported memory layout
    INFINIOP_STATUS_NULL_POINTER = 6,              // required pointer was null
    INFINIOP_STATUS_INSUFFICIENT_WORKSPACE = 7,    // workspace smaller than required
    INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED = 8, // backend not available for device
    INFINIOP_STATUS_BAD_DEVICE = 9,                // device id out of range / unusable
    INFINIOP_STATUS_UNDEFINED_BEHAVIOR = 10,       // unspecified failure
} infiniopStatus_t;

#endif
#ifndef __INFINIOP_TENSOR_DESCRIPTOR__
#define __INFINIOP_TENSOR_DESCRIPTOR__

#include "../infinicore.h"
#include "./status.h"

// Describes a (possibly strided) tensor: dtype, rank, shape and strides.
// The descriptor does not own any data buffer.
struct InfiniopTensorDescriptor {
    // Datatype
    infiniDtype_t dtype;
    // Number of dimensions
    size_t ndim;
    // Shape of the tensor, ndim elements
    size_t *shape;
    // Stride of each dimension in elements, ndim elements
    int64_t *strides;
};

typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;

// Allocate a descriptor and copy in shape/strides. Pairs with
// infiniopDestroyTensorDescriptor.
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, int64_t const *strides, infiniDtype_t dtype);

// Release a descriptor created by infiniopCreateTensorDescriptor.
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);

#endif// __INFINIOP_TENSOR_DESCRIPTOR__
# Builds the AscendC device kernels as a static library (ascend_kernels).
cmake_minimum_required(VERSION 3.16.0)

# project information
project(Ascend_C)

# Target SoC and CANN toolkit location; override on the configure command line.
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)

# Locate the AscendC kernel CMake helpers; their install path varies across CANN releases.
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

# Device-side kernel sources compiled with the AscendC toolchain.
ascendc_library(ascend_kernels STATIC
    ../../ops/swiglu/ascend/swiglu_kernel.cpp
    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
    ../../ops/random_sample/ascend/random_sample_kernel.cpp
)
# Drives the CMake build for this directory's project.
.PHONY: build clean

# Absolute path of this Makefile and its directory (robust when invoked
# with `make -f` from another location).
MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
MKFILE_DIR := $(dir $(MKFILE_PATH))

# Configure and compile out-of-source with 8 parallel jobs.
build:
	mkdir -p build && cd build && cmake .. && make -j8

# Remove all build artifacts.
clean:
	rm -rf build
#include "ascend_handle.h"
// Create a backend handle bound to Ascend NPU `device_id`.
// Returns STATUS_BAD_DEVICE if the id is out of range or the device cannot
// be activated; on success *handle_ptr owns a newly allocated AscendContext.
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id) {
    uint32_t device_count;
    auto count_ret = aclrtGetDeviceCount(&device_count);
    CHECK_RET(count_ret == ACL_SUCCESS,
              LOG_PRINT("aclrtGetDeviceCount failed. ERROR: %d\n", count_ret);
              return STATUS_BAD_DEVICE);
    // Reject negative ids as well as ids beyond the visible device count.
    if (device_id < 0 || device_id >= static_cast<int>(device_count)) {
        return STATUS_BAD_DEVICE;
    }
    auto ret = aclrtSetDevice(device_id);
    // The original code only logged this failure and still returned
    // STATUS_SUCCESS with a handle to a device that was never activated;
    // propagate the error instead.
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret);
              return STATUS_BAD_DEVICE);
    *handle_ptr = new AscendContext{DevAscendNpu, device_id};
    return STATUS_SUCCESS;
}
// Release a handle created by createAscendHandle.
// delete on a null pointer is a no-op, so passing nullptr is harmless.
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr) {
    delete handle_ptr;
    return STATUS_SUCCESS;
}
#ifndef ASCEND_HANDLE_H
#define ASCEND_HANDLE_H

#include "common_ascend.h"
#include "device.h"
#include "status.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnn/acl_meta.h>
#include <memory>

// Per-device context for the Ascend NPU backend.
struct AscendContext {
    Device device;  // backend device kind (DevAscendNpu)
    int device_id;  // index of the NPU this handle is bound to
};

typedef struct AscendContext *AscendHandle_t;

// Allocate a context bound to `device_id`; pairs with deleteAscendHandle.
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id);

// Free a context created by createAscendHandle.
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr);

#endif
#include "common_ascend.h"
// Product of the first `num` extents of `shape`; 1 for an empty shape.
int64_t numElements(const int64_t *shape, int64_t num) {
    return std::accumulate(shape, shape + num, int64_t{1},
                           std::multiplies<int64_t>());
}
// Allocate `workspaceSize` bytes of device memory into *workspaceAddr.
// *workspaceAddr is reset to nullptr first, so a zero-size request succeeds
// and yields a null workspace. Returns STATUS_EXECUTION_FAILED if
// aclrtMalloc fails.
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) {
    *workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        // ACL_MEM_MALLOC_HUGE_FIRST: per ACL docs, prefer large-page memory
        // and fall back to normal pages.
        auto ret = aclrtMalloc(workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret);
                  return STATUS_EXECUTION_FAILED);
    }
    return STATUS_SUCCESS;
}
// Free device memory allocated by mallocWorkspace.
// A null pointer is accepted (matches the zero-size path of mallocWorkspace).
// Returns STATUS_EXECUTION_FAILED if aclrtFree fails.
infiniopStatus_t freeWorkspace(void *workspaceAddr) {
    if (workspaceAddr != nullptr) {
        auto ret = aclrtFree(workspaceAddr);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret);
                  return STATUS_EXECUTION_FAILED);
    }
    return STATUS_SUCCESS;
}
// Map an InfiniOp data type to the corresponding ACL data type.
// Unrecognized types map to ACL_DT_UNDEFINED.
aclDataType toAclDataType(DT dt) {
    struct Mapping {
        DT from;
        aclDataType to;
    };
    static const Mapping kTable[] = {
        {I8, aclDataType::ACL_INT8},
        {I16, aclDataType::ACL_INT16},
        {I32, aclDataType::ACL_INT32},
        {I64, aclDataType::ACL_INT64},
        {U8, aclDataType::ACL_UINT8},
        {U16, aclDataType::ACL_UINT16},
        {U32, aclDataType::ACL_UINT32},
        {U64, aclDataType::ACL_UINT64},
        {F16, aclDataType::ACL_FLOAT16},
        {BF16, aclDataType::ACL_BF16},
        {F32, aclDataType::ACL_FLOAT},
        {F64, aclDataType::ACL_DOUBLE},
    };
    for (const auto &entry : kTable) {
        if (dt == entry.from) {
            return entry.to;
        }
    }
    return aclDataType::ACL_DT_UNDEFINED;
}
// Human-readable name of an ACL data type, for logging; "UNKNOWN" if unmapped.
const char *dataTypeToString(aclDataType dtype) {
    struct Entry {
        aclDataType value;
        const char *name;
    };
#define DT_ENTRY(x) {x, #x}
    static const Entry kEntries[] = {
        DT_ENTRY(ACL_DT_UNDEFINED),
        DT_ENTRY(ACL_FLOAT),
        DT_ENTRY(ACL_FLOAT16),
        DT_ENTRY(ACL_INT8),
        DT_ENTRY(ACL_INT32),
        DT_ENTRY(ACL_UINT8),
        DT_ENTRY(ACL_INT16),
        DT_ENTRY(ACL_UINT16),
        DT_ENTRY(ACL_UINT32),
        DT_ENTRY(ACL_INT64),
        DT_ENTRY(ACL_UINT64),
        DT_ENTRY(ACL_DOUBLE),
        DT_ENTRY(ACL_BOOL),
        DT_ENTRY(ACL_STRING),
        DT_ENTRY(ACL_COMPLEX64),
        DT_ENTRY(ACL_COMPLEX128),
        DT_ENTRY(ACL_BF16),
        DT_ENTRY(ACL_INT4),
        DT_ENTRY(ACL_UINT1),
        DT_ENTRY(ACL_COMPLEX32),
    };
#undef DT_ENTRY
    for (const auto &e : kEntries) {
        if (e.value == dtype) {
            return e.name;
        }
    }
    return "UNKNOWN";
}
// Human-readable name of an ACL tensor format, for logging; "UNKNOWN" if unmapped.
const char *formatToString(aclFormat format) {
    struct Entry {
        aclFormat value;
        const char *name;
    };
#define FMT_ENTRY(x) {x, #x}
    static const Entry kEntries[] = {
        FMT_ENTRY(ACL_FORMAT_UNDEFINED),
        FMT_ENTRY(ACL_FORMAT_NCHW),
        FMT_ENTRY(ACL_FORMAT_NHWC),
        FMT_ENTRY(ACL_FORMAT_ND),
        FMT_ENTRY(ACL_FORMAT_NC1HWC0),
        FMT_ENTRY(ACL_FORMAT_FRACTAL_Z),
        FMT_ENTRY(ACL_FORMAT_NC1HWC0_C04),
        FMT_ENTRY(ACL_FORMAT_HWCN),
        FMT_ENTRY(ACL_FORMAT_NDHWC),
        FMT_ENTRY(ACL_FORMAT_FRACTAL_NZ),
        FMT_ENTRY(ACL_FORMAT_NCDHW),
        FMT_ENTRY(ACL_FORMAT_NDC1HWC0),
        FMT_ENTRY(ACL_FRACTAL_Z_3D),
        FMT_ENTRY(ACL_FORMAT_NC),
        FMT_ENTRY(ACL_FORMAT_NCL),
    };
#undef FMT_ENTRY
    for (const auto &e : kEntries) {
        if (e.value == format) {
            return e.name;
        }
    }
    return "UNKNOWN";
}
#ifndef __COMMON_ASCEND_H__
#define __COMMON_ASCEND_H__

#include "operators.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <cstdio>
#include <functional>
#include <inttypes.h>
#include <numeric>
#include <vector>

// NOTE(review): this extern "C" block wraps only preprocessor macros, which
// have no linkage — it is a no-op here; confirm whether it was meant to cover
// the declarations below.
#ifdef __cplusplus
extern "C" {
#endif

// If `cond` is false, evaluate `return_expr` (typically a LOG_PRINT and/or a
// return statement). do/while(0) makes it behave as a single statement.
#define CHECK_RET(cond, return_expr) \
    do {                             \
        if (!(cond)) {               \
            return_expr;             \
        }                            \
    } while (0)

// printf-style logging helper.
#define LOG_PRINT(message, ...)         \
    do {                                \
        printf(message, ##__VA_ARGS__); \
    } while (0)

#ifdef __cplusplus
};
#endif

// Product of the first `num` extents of `shape`.
int64_t numElements(const int64_t *shape, int64_t num);
// Logging helpers: enum -> name strings.
const char *dataTypeToString(aclDataType dtype);
const char *formatToString(aclFormat format);
// Device workspace allocation helpers (see common_ascend.cpp).
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize);
infiniopStatus_t freeWorkspace(void *workspaceAddr);
// InfiniOp dtype -> ACL dtype; ACL_DT_UNDEFINED when unsupported.
aclDataType toAclDataType(DT dt);

#endif
#include "tensor_aclnn.h"
#include "../../ops/utils.h"
#include <algorithm>
/// @brief Fill this descriptor with dtype, shape and strides, then derive the
/// storage shape. Shape and strides must have matching lengths.
/// @return STATUS_BAD_PARAM on length mismatch, otherwise the status of
/// inferStorageShape().
infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    if (shape.size() != strides.size()) {
        return STATUS_BAD_PARAM;
    }
    this->ndim = shape.size();
    this->shape = shape;
    this->strides = strides;
    this->dataType = dtype;
    // Only the generic ND layout is supported for now.
    // TODO: Support other formats.
    this->format = aclFormat::ACL_FORMAT_ND;
    CHECK_STATUS(this->inferStorageShape(), STATUS_SUCCESS);
    return STATUS_SUCCESS;
}
/// @brief Infer the storage shape. Returns a 1D shape holding the total
/// storage extent (the max-stride dimension's extent times its stride).
/// We don't see why a higher-dimensional storage shape is ever needed;
/// to change if necessary.
infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() {
    this->storageNdim = 1;
    // 0-dimensional (scalar) tensors have no strides; their storage holds a
    // single element. The original code called max_element on the empty
    // vector and then indexed with size() — past-the-end access.
    if (this->strides.empty()) {
        this->storageShape = std::vector<int64_t>{1};
        return STATUS_SUCCESS;
    }
    auto it = std::max_element(this->strides.begin(), this->strides.end());
    auto max_stride_index = static_cast<uint64_t>(std::distance(this->strides.begin(), it));
    this->storageShape = std::vector<int64_t>({this->shape[max_stride_index] * this->strides[max_stride_index]});
    return STATUS_SUCCESS;
}
/// @brief Populate this descriptor from an InfiniOp tensor descriptor,
/// converting the shape elements to int64_t.
/// @param y source infiniopTensorDescriptor
/// @return status of setDescriptor
infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y) {
    const uint64_t rank = y->ndim;
    std::vector<int64_t> dims(rank);
    std::vector<int64_t> steps(rank);
    for (uint64_t d = 0; d < rank; ++d) {
        dims[d] = static_cast<int64_t>(y->shape[d]);
        steps[d] = y->strides[d];
    }
    return setDescriptor(toAclDataType(y->dt), dims, steps);
}
/// @brief Wrapper of aclCreateTensor. Creates the aclTensor for this
/// descriptor and caches it in `t`; subsequent calls are no-ops, so the
/// tensor is only ever built once per descriptor.
/// See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha001/apiref/appdevgapi/aclcppdevg_03_0168.html
/// @param data Device global-memory pointer the tensor views (may be null;
///             defaults to nullptr in the declaration).
/// @return STATUS_SUCCESS (the aclCreateTensor result is not checked here).
infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) {
    // Already created: keep the cached tensor.
    if (this->t) {
        return STATUS_SUCCESS;
    }
    // Argument order follows the aclCreateTensor signature:
    // view shape/ndim, dtype, strides, offset, format, storage shape/ndim, data.
    this->t = aclCreateTensor(this->shape.data(),
                              this->ndim,
                              this->dataType,
                              this->strides.data(),
                              this->offset,
                              this->format,
                              this->storageShape.data(),
                              this->storageNdim,
                              data);
    return STATUS_SUCCESS;
}
/// @brief Destroy the cached aclTensor and reset `t` to null so the
/// descriptor can be reused.
/// @return STATUS_EXECUTION_FAILED if aclDestroyTensor reports an error.
infiniopStatus_t aclnnTensorDescriptor::destroyTensor() {
    auto ret = aclDestroyTensor(this->t);
    // Fixed the misspelled API name in the log message ("aclDesctroyTensor").
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclDestroyTensor failed, ERROR: %d\n", ret);
              return STATUS_EXECUTION_FAILED);
    t = nullptr;
    return STATUS_SUCCESS;
}
// RAII cleanup: release the cached aclTensor, if one was created.
aclnnTensorDescriptor::~aclnnTensorDescriptor() {
    if (this->t) {
        destroyTensor();
    }
}
/// @brief Render this descriptor as a human-readable multi-line string.
/// @return Heap-allocated (malloc) C string — caller must free() it.
///         NULL on allocation failure.
char *aclnnTensorDescriptor::toString() {
    // Rough upper bound: fixed text plus ~40 chars per dimension entry.
    size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40;
    char *buffer = (char *) malloc(bufferSize);
    if (!buffer) return NULL;
    // Write info into buffer
    char *ptr = buffer;
    // ndim is uint64_t, so PRIu64 is required (the original used PRId64,
    // which mismatches the argument type).
    ptr += sprintf(ptr, "ndim: %" PRIu64 "\n", this->ndim);
    ptr += sprintf(ptr, "shape: [");
    for (uint64_t i = 0; i < this->ndim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->shape[i]);
        // `i + 1 < ndim` avoids the unsigned underflow of `ndim - 1` at ndim == 0.
        if (i + 1 < this->ndim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "stride: [");
    for (uint64_t i = 0; i < this->ndim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->strides[i]);
        if (i + 1 < this->ndim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "offset: %" PRId64 "\n", this->offset);
    ptr += sprintf(ptr, "dataType: %s\n", dataTypeToString(this->dataType));
    ptr += sprintf(ptr, "format: %s\n", formatToString(this->format));
    ptr += sprintf(ptr, "storageShape: [");
    for (int64_t i = 0; i < this->storageNdim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->storageShape[i]);
        if (i + 1 < this->storageNdim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "storageNdim: %" PRId64 "\n", this->storageNdim);
    return buffer;
}
#ifndef __ACLNN_TENSOR__
#define __ACLNN_TENSOR__

#include "./common_ascend.h"
#include "operators.h"
#include "tensor.h"
#include "tensor/tensor_descriptor.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <vector>

// Aclnn tensor descriptor: holds everything needed to build (and cache)
// an aclTensor for the aclnn operator APIs.
struct aclnnTensorDescriptor {
    uint64_t ndim;                     // number of view dimensions
    std::vector<int64_t> shape;        // view shape, ndim elements
    std::vector<int64_t> strides;      // element strides, ndim elements
    int64_t offset;                    // element offset into storage
    aclDataType dataType;              // ACL element type
    aclFormat format;                  // layout format (currently always ND)
    std::vector<int64_t> storageShape; // underlying storage shape (1D, see inferStorageShape)
    int64_t storageNdim;               // rank of storageShape
    aclTensor *t;                      // cached aclTensor, built lazily by createTensor
    // Fill dtype/shape/strides and derive the storage shape.
    infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides);
    // Derive storageShape/storageNdim from shape and strides.
    infiniopStatus_t inferStorageShape();
    // Convert from an InfiniOp tensor descriptor.
    infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc);
    // Build (once) the aclTensor viewing `data`.
    infiniopStatus_t createTensor(void *data = nullptr);
    // Destroy the cached aclTensor.
    infiniopStatus_t destroyTensor();
    ~aclnnTensorDescriptor();
    // Debug string; caller frees the malloc'd result.
    char *toString();
};

typedef aclnnTensorDescriptor *aclnnTensorDescriptor_t;

#endif
#include "bang_handle.h"
// Create a backend handle bound to Cambricon MLU `device_id`, pre-populating
// its pool with one cnnl handle created on that device.
// Returns INFINIOP_STATUS_BAD_DEVICE if the id is out of range or the device
// cannot be queried/selected.
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id) {
    unsigned int device_count;
    // The original code ignored this return value; a failed query would leave
    // device_count uninitialized.
    if (cnrtGetDeviceCount(&device_count) != cnrtSuccess) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    // Reject negative ids as well as ids beyond the visible device count.
    if (device_id < 0 || device_id >= static_cast<int>(device_count)) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    if (cnrtSetDevice(device_id) != cnrtSuccess) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    // Seed the pool with a single handle bound to the now-current device.
    auto pool = std::make_shared<Pool<cnnlHandle_t>>();
    cnnlHandle_t handle;
    cnnlCreate(&handle);
    pool->push(std::move(handle));
    *handle_ptr = new InfiniopBangHandle{INFINI_DEVICE_CAMBRICON, device_id, std::move(pool)};
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H

#include "../pool.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/handle.h"
#include <memory>

// Per-device handle for the Cambricon BANG backend, carrying a shared pool
// of reusable cnnl handles.
struct InfiniopBangHandle {
    infiniDevice_t device;
    int device_id;
    std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
};

typedef struct InfiniopBangHandle *infiniopBangHandle_t;

infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id);

// Borrow a cnnl handle from the pool (creating a fresh one on `device_id` if
// the pool is empty), bind it to `queue`, invoke `f` with it, and return it
// to the pool afterwards.
template<typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, int device_id, cnrtQueue_t queue, T const &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a new handle locally, then assign it.
        // The original code called cnnlCreate(&(*handle)) — dereferencing the
        // empty pop() result before it held a value, which is undefined
        // behavior; build the handle in a local first.
        cnrtSetDevice(device_id);
        cnnlHandle_t fresh;
        cnnlCreate(&fresh);
        handle = fresh;
    }
    cnnlSetQueue(*handle, (cnrtQueue_t) queue);
    f(*handle);
    pool->push(std::move(*handle));
}

#endif
#ifndef __COMMON_BANG_H__
#define __COMMON_BANG_H__

#include "cnnl.h"
#include "infinicore.h"
#include <vector>

// On-chip scratch memory budgets used by BANG kernels.
// NOTE(review): the value is 256 KiB but the original comment claimed the
// maximum NRAM is 768 KiB — confirm the intended budget for the target chip.
const int NRAM_MAX_SIZE = 1024 * 256;
const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024;

// Set a cnnl tensor descriptor from `layout` without strides (contiguous
// array layout). NOTE(review): the dtype is hardcoded to CNNL_DTYPE_HALF
// instead of using cnnlDataTypeConvert(layout->...) — confirm callers only
// use fp16 tensors here.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    std::vector<int> dims(layout->ndim);
    for (uint64_t i = 0; i < layout->ndim; i++) {
        dims[i] = static_cast<int>(layout->shape[i]);
    }
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                            dims.size(), dims.data());
}

// Set a cnnl tensor descriptor from `layout` including strides.
// Strides are converted from bytes to elements by dividing by the dtype size.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
    for (uint64_t i = 0; i < layout->ndim; i++) {
        dim_size[i] = static_cast<int>(layout->shape[i]);
        dim_stride[i] = static_cast<int>(layout->strides[i] / layout->dt.size);
    }
    cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                              dim_size.size(), dim_size.data(), dim_stride.data());
}

// Map an infiniop dtype to the corresponding cnnl dtype;
// CNNL_DTYPE_INVALID when unsupported.
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
    if (dtype_eq(dataType, INFINI_DTYPE_F32)) {
        return CNNL_DTYPE_FLOAT;
    } else if (dtype_eq(dataType, INFINI_DTYPE_F64)) {
        return CNNL_DTYPE_DOUBLE;
    } else if (dtype_eq(dataType, INFINI_DTYPE_F16)) {
        return CNNL_DTYPE_HALF;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I8)) {
        return CNNL_DTYPE_INT8;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I32)) {
        return CNNL_DTYPE_INT32;
    } else if (dtype_eq(dataType, INFINI_DTYPE_U8)) {
        return CNNL_DTYPE_UINT8;
    } else if (dtype_eq(dataType, INFINI_DTYPE_BF16)) {
        return CNNL_DTYPE_BFLOAT16;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I64)) {
        return CNNL_DTYPE_INT64;
    } else {
        return CNNL_DTYPE_INVALID;
    }
}

#endif// __COMMON_BANG_H__
#include "./common_cpu.h"
// Convert an IEEE-754 binary16 value (given as its raw bit pattern) to float.
// Handles Inf/NaN, signed zero, subnormals, and normalized values.
float f16_to_f32(uint16_t h) {
    // Cast before shifting: the original `(h & 0x8000) << 16` shifted a set
    // bit into the sign bit of a promoted int, which is undefined behavior.
    uint32_t sign = (uint32_t) (h & 0x8000) << 16;// sign bit
    int32_t exponent = (h >> 10) & 0x1F;          // 5-bit biased exponent
    uint32_t mantissa = h & 0x3FF;                // 10-bit fraction
    uint32_t f32;
    if (exponent == 31) {
        // Inf (mantissa == 0) or NaN (mantissa != 0): all-ones f32 exponent,
        // mantissa widened into the f32 fraction field.
        f32 = sign | 0x7F800000 | (mantissa << 13);
    } else if (exponent == 0) {
        if (mantissa == 0) {
            // Zero (positive or negative): just the sign bit.
            f32 = sign;
        } else {
            // Subnormal half: renormalize the mantissa into a normal float.
            exponent = -14;
            while ((mantissa & 0x400) == 0) {
                mantissa <<= 1;
                exponent--;
            }
            mantissa &= 0x3FF;// clear the now-implicit leading 1
            f32 = sign | ((uint32_t) (exponent + 127) << 23) | (mantissa << 13);
        }
    } else {
        // Normalized: re-bias the exponent from 15 to 127.
        f32 = sign | ((uint32_t) (exponent + 127 - 15) << 23) | (mantissa << 13);
    }
    // memcpy-based type punning replaces `*(float *) &f32`, which violates
    // strict aliasing.
    float out;
    std::memcpy(&out, &f32, sizeof(out));
    return out;
}
// Convert a float to an IEEE-754 binary16 bit pattern (truncating rounding).
// Handles Inf/NaN, overflow to Inf, subnormal results, and underflow to
// signed zero.
uint16_t f32_to_f16(float val) {
    // memcpy-based type punning replaces `*(uint32_t *) &val`, which violates
    // strict aliasing.
    uint32_t f32;
    std::memcpy(&f32, &val, sizeof(f32));
    uint16_t sign = (f32 >> 16) & 0x8000;                   // sign bit
    int32_t exponent = (int32_t) ((f32 >> 23) & 0xFF) - 127;// unbiased exponent
    uint32_t mantissa = f32 & 0x7FFFFF;                     // 23-bit fraction
    if (exponent >= 31) {// too large for half, or Inf/NaN
        // NaN: f32 exponent field all ones (128 after de-bias) with payload.
        if (exponent == 128 && mantissa != 0) {
            return sign | 0x7E00;
        }
        // Infinity (or finite overflow).
        return sign | 0x7C00;
    } else if (exponent >= -14) {// normalized half
        return sign | ((uint32_t) (exponent + 15) << 10) | (mantissa >> 13);
    } else if (exponent >= -24) {// subnormal half
        mantissa |= 0x800000;// restore the implicit leading 1
        mantissa >>= (-14 - exponent);
        return sign | (mantissa >> 13);
    } else {
        // Too small even for a subnormal: signed zero.
        return sign;
    }
}
// Given the flat index into a broadcasted tensor, return the memory offset
// (in elements) into the original (pre-broadcast) tensor: decompose the flat
// index into per-dimension coordinates via the broadcasted strides, then
// re-accumulate with the target strides.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broadcasted_strides, int64_t const *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t d = 0; d < ndim; ++d) {
        const size_t coord = remaining / broadcasted_strides[d];
        remaining -= coord * broadcasted_strides[d];
        offset += coord * target_strides[d];
    }
    return offset;
}
// Return the memory offset (in elements) of `flat_index` in a tensor with the
// given shape and strides, consuming dimensions from the innermost (last) axis.
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides) {
    size_t res = 0;
    // The original condition `i-- >= 0` is always true for unsigned i: when i
    // reaches 0 it wraps to SIZE_MAX and reads shape[SIZE_MAX]. `i-- > 0`
    // visits indices ndim-1 .. 0 exactly once and terminates.
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
// Total element count of an ndim-ary tensor after applying symmetric padding.
// The first two axes (typically batch and channel) are never padded; axis i
// (i >= 2) grows by 2 * pads[i - 2].
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
    uint64_t count = 1;
    for (size_t axis = 0; axis < ndim; ++axis) {
        const size_t pad = (axis < 2) ? 0 : 2 * pads[axis - 2];
        count *= shape[axis] + pad;
    }
    return count;
}
// Return the shape after applying symmetric padding: axes 0 and 1 are kept,
// axis i (i >= 2) grows by 2 * pads[i - 2].
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads) {
    // Iterator-range construction replaces the memcpy into a presized vector.
    std::vector<size_t> padded_shape(shape, shape + ndim);
    for (size_t i = 2; i < ndim; ++i) {
        padded_shape[i] += 2 * pads[i - 2];
    }
    // Plain return enables NRVO; the original `return std::move(...)`
    // suppressed copy elision for no benefit.
    return padded_shape;
}
#ifndef __INFINIOP__COMMON_CPU_H__
#define __INFINIOP__COMMON_CPU_H__

#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Convert a half-precision float (raw binary16 bits) to single precision.
float f16_to_f32(uint16_t code);
// Convert a single-precision float to half-precision (raw binary16 bits).
uint16_t f32_to_f16(float val);
// Return the memory offset into the original tensor, given the flattened
// index of the broadcasted tensor and both stride arrays.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broadcasted_strides, int64_t const *target_strides);
// Return the memory offset of a tensor element given its flattened index.
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides);
/**
 * Get the total array size (element count) after applying padding for an
 * ndim-ary tensor with the given shape. Axes 0 and 1 are not padded.
 */
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
// Calculate and return the padded shape (axes 0 and 1 unchanged).
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);

#endif// __INFINIOP__COMMON_CPU_H__
#include "./cpu_handle.h"
// Allocate a handle for the CPU backend into *handle_ptr.
// No device discovery is needed; the device id is always 0. Always succeeds.
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t* handle_ptr){
    *handle_ptr = new InfiniopHandle{INFINI_DEVICE_CPU, 0};
    return INFINIOP_STATUS_SUCCESS;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment