Unverified Commit 4e4d3415 authored by Catheriany's avatar Catheriany Committed by GitHub
Browse files

Merge branch 'main' into issue/150

parents d1c46889 1a4cfb99
# InfiniCore
[![Doc](https://img.shields.io/badge/Document-ready-blue)](https://github.com/InfiniTensor/InfiniCore-Documentation)
[![CI](https://github.com/InfiniTensor/InfiniCore/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/InfiniTensor/InfiniCore/actions)
[![license](https://img.shields.io/github/license/InfiniTensor/InfiniCore)](https://mit-license.org/)
![GitHub repo size](https://img.shields.io/github/repo-size/InfiniTensor/InfiniCore)
![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/InfiniTensor/InfiniCore)
[![GitHub Issues](https://img.shields.io/github/issues/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/pulls)
![GitHub contributors](https://img.shields.io/github/contributors/InfiniTensor/InfiniCore)
![GitHub commit activity](https://img.shields.io/github/commit-activity/m/InfiniTensor/InfiniCore)
InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功能(包括计算、运行时、通信等)提供统一 C 语言接口。目前支持的硬件和后端包括:
- CPU;
...@@ -15,22 +26,6 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -15,22 +26,6 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
## 配置和使用
### 软件依赖
- XMake编译器
XMake配置选项(`XMAKE_CONFIG_FLAGS`)以及含义
- `--omp=[y|n]` 是否使用OpenMP,默认开启
- `--cpu=[y|n]` 是否编译CPU接口实现,默认开启
- `--nv-gpu=[y|n]` 是否编译英伟达GPU接口实现
- `--ascend-npu=[y|n]` 是否编译昇腾NPU接口实现
- `--cambricon-mlu=[y|n]` 是否编译寒武纪MLU接口实现
- `--metax-gpu=[y|n]` 是否编译沐曦GPU接口实现
- `--moore-gpu=[y|n]` 是否编译摩尔线程GPU接口实现
- `--sugon-dcu=[y|n]` 是否编译曙光DCU接口实现
- `--kunlun-xpu=[y|n]` 是否编译昆仑XPU接口实现
### 一键安装

`script/` 目录中提供了 `install.py` 安装脚本。使用方式如下:
...@@ -41,6 +36,21 @@ cd InfiniCore ...@@ -41,6 +36,21 @@ cd InfiniCore
python scripts/install.py [XMAKE_CONFIG_FLAGS]
```
参数 `XMAKE_CONFIG_FLAGS` 是 xmake 构建配置,可配置下列可选项:
| 选项 | 功能 | 默认值
|--------------------------|-------------------------------|:-:
| `--omp=[y\|n]` | 是否使用 OpenMP | y
| `--cpu=[y\|n]` | 是否编译 CPU 接口实现 | y
| `--nv-gpu=[y\|n]` | 是否编译英伟达 GPU 接口实现 | n
| `--ascend-npu=[y\|n]` | 是否编译昇腾 NPU 接口实现 | n
| `--cambricon-mlu=[y\|n]` | 是否编译寒武纪 MLU 接口实现 | n
| `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n
| `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装

1. 项目配置
...@@ -101,6 +111,18 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend] ...@@ -101,6 +111,18 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
详见 `test/infiniop-test` 目录
#### 通信库(InfiniCCL)测试
编译(需要先安装InfiniCCL):
```shell
xmake build infiniccl-test
```
在英伟达平台运行测试(会自动使用所有可见的卡):
```shell
infiniccl-test --nvidia
```
## 开发指南

### 代码格式化
...@@ -137,3 +159,22 @@ options: ...@@ -137,3 +159,22 @@ options:
- 若设置 `--check`,将检查代码是否需要修改格式,不修改文件内容; - 若设置 `--check`,将检查代码是否需要修改格式,不修改文件内容;
- 通过 `--c` 指定 c/c++ 格式化器,默认为 `clang-format-16` - 通过 `--c` 指定 c/c++ 格式化器,默认为 `clang-format-16`
- 通过 `--python` 指定 python 格式化器 `black` - 通过 `--python` 指定 python 格式化器 `black`
### vscode 开发配置
基本配置见 [xmake 官方文档](https://xmake.io/#/zh-cn/plugin/more_plugins?id=%e9%85%8d%e7%bd%ae-intellsence)
- TL;DR
- clangd
打开 *xmake.lua*,保存一次以触发编译命令生成,将在工作路径下自动生成 *.vscode/compile_commands.json* 文件。然后在这个文件夹下创建 *settings.json*,填入:
> .vscode/settings.json
```json
{
"clangd.arguments": [
"--compile-commands-dir=.vscode"
]
}
```
#ifndef __INFINICCL_API_H__
#define __INFINICCL_API_H__

#include "infinirt.h"

// Reduction operators supported by collective communication calls.
typedef enum {
    INFINICCL_SUM = 0,
    INFINICCL_PROD = 1,
    INFINICCL_MAX = 2,
    INFINICCL_MIN = 3,
    INFINICCL_AVG = 4,
} infinicclReduceOp_t;

// Opaque communicator handle; one per participating device.
struct InfinicclComm;
typedef struct InfinicclComm *infinicclComm_t;

// Creates one communicator per device in `device_ids` (length `ndevice`)
// for the given backend, writing them into `comms`.
__C __export infiniStatus_t infinicclCommInitAll(
    infiniDevice_t device_type,
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids);

// Destroys a communicator created by infinicclCommInitAll.
__C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);

// Reduces `count` elements of `sendbuf` across all ranks of `comm` with `op`,
// writing the result to `recvbuf` (in-place allowed). `stream` may be null
// (default stream). Note: parameter renamed from misspelled "dataype".
__C __export infiniStatus_t infinicclAllReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream);

#endif
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "infiniop/ops/global_avg_pool.h" #include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/max_pool.h" #include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h" #include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h" #include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h" #include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h" #include "infiniop/ops/relu.h"
......
...@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, ...@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b); infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc, __C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c, void *c,
void const *a, void const *a,
void const *b, void const *b,
......
#ifndef __INFINIOP_MUL_API_H__
#define __INFINIOP_MUL_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the elementwise multiplication operator.
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
// Creates a Mul descriptor for output tensor c and input tensors a, b.
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
                                                        infiniopMulDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);
// Queries the workspace size (bytes) required by infiniopMul for this descriptor.
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
// Executes the multiplication into c using the provided workspace.
// NOTE(review): callers in this repo pass stream == nullptr; presumably that
// selects the default stream — confirm against the runtime docs.
__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *c,
                                        const void *a,
                                        const void *b,
                                        void *stream);
// Releases a descriptor created by infiniopCreateMulDescriptor.
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
...@@ -12,7 +12,7 @@ os.chdir(PROJECT_DIR) ...@@ -12,7 +12,7 @@ os.chdir(PROJECT_DIR)
def run_tests(args): def run_tests(args):
failed = [] failed = []
for test in [ for test in [
"causal_softmax.py", "add.py",
"gemm.py", "gemm.py",
"random_sample.py", "random_sample.py",
"rms_norm.py", "rms_norm.py",
......
#include "infiniccl_test.hpp"
#include <chrono>
#include <cstring>
#include <iostream>
#include <numeric>
#include <pthread.h>
#include <vector>
// Bail out of the calling int-returning function with 1 on any API failure.
#define TEST_INFINI(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return 1)
// Same, but for pthread entry points, which must return a void pointer.
#define TEST_INFINI_THREAD(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return nullptr)
// Largest element count exercised; also sizes the shared host buffers.
const size_t MAX_COUNT = 100ULL * 1024 * 1024;
// Element counts tested for each dtype.
const size_t TEST_COUNTS[] = {
    128,
    1024,
    4 * 1024,
    MAX_COUNT,
};
// Element types exercised by the AllReduce test.
const infiniDtype_t TEST_DTYPES[] = {INFINI_DTYPE_F32, INFINI_DTYPE_F16};
// Untimed warm-up iterations before the benchmark loop.
const size_t WARM_UPS = 10;
// Timed iterations averaged into the reported latency.
const size_t ITERATIONS = 100;
// Per-rank inputs/outputs for one AllReduce worker thread.
struct ThreadArgs {
    int rank;                   // logical rank index in [0, ndevice)
    int device_id;              // physical device this thread binds to
    infinicclComm_t comm;       // communicator for this rank
    infiniDevice_t device_type; // hardware backend under test
    infiniDtype_t dtype;        // element type of the reduced buffer
    size_t count;               // number of elements to reduce
    const void *data;           // host input buffer (shared, read-only)
    const void *ans;            // host expected-result buffer (shared, read-only)
    int *result;                // out: 0 on success, nonzero on failure
    double *time;               // out: average latency in milliseconds
};
// Fills `count` elements of `data` with `val`, encoded according to `dtype`.
// Unsupported dtypes abort the process.
void setData(infiniDtype_t dtype, void *data, size_t count, float val) {
    if (dtype == INFINI_DTYPE_F32) {
        float *dst = (float *)data;
        for (size_t idx = 0; idx < count; ++idx) {
            dst[idx] = val;
        }
    } else if (dtype == INFINI_DTYPE_F16) {
        fp16_t *dst = (fp16_t *)data;
        for (size_t idx = 0; idx < count; ++idx) {
            dst[idx] = utils::cast<fp16_t>(val);
        }
    } else {
        std::abort();
    }
}
// Counts elementwise mismatches between `actual_` and `expected_` using an
// absolute tolerance of 1e-4; fp16 values are widened to float before comparing.
template <typename T>
int checkData(const T *actual_, const T *expected_, size_t count) {
    int mismatched = 0;
    for (size_t idx = 0; idx < count; ++idx) {
        if constexpr (std::is_same<T, fp16_t>::value) {
            const float lhs = utils::cast<float>(actual_[idx]);
            const float rhs = utils::cast<float>(expected_[idx]);
            mismatched += (std::abs(lhs - rhs) > 1e-4) ? 1 : 0;
        } else {
            mismatched += (std::abs(actual_[idx] - expected_[idx]) > 1e-4) ? 1 : 0;
        }
    }
    return mismatched;
}
// Type-erased entry point: dispatches to the typed checker for `dtype`.
int checkData(const void *actual, const void *expected, infiniDtype_t dtype, size_t count) {
    if (dtype == INFINI_DTYPE_F32) {
        return checkData((const float *)actual, (const float *)expected, count);
    }
    if (dtype == INFINI_DTYPE_F16) {
        return checkData((const fp16_t *)actual, (const fp16_t *)expected, count);
    }
    std::abort();
    return 1;
}
// Per-rank worker: verifies AllReduce correctness against args->ans, then
// benchmarks it. Sets *args->result to 0 on success (1 otherwise) and writes
// the average per-call latency to *args->time.
void *testAllReduceThread(void *arg) {
    ThreadArgs *args = (ThreadArgs *)arg;
    *(args->result) = 1; // pessimistic default; cleared only on full success

    TEST_INFINI_THREAD(infinirtSetDevice(args->device_type, args->device_id));

    const size_t nbytes = args->count * infiniSizeOf(args->dtype);

    // RAII guard for the device buffer: the original leaked both the host and
    // device allocations on every TEST_INFINI_THREAD early return.
    struct DeviceBuf {
        void *p = nullptr;
        ~DeviceBuf() {
            if (p != nullptr) {
                infinirtFree(p);
            }
        }
    } buf;

    // Host-side result buffer, zero-initialized (replaces unchecked malloc+memset).
    std::vector<char> output(nbytes, 0);

    TEST_INFINI_THREAD(infinirtMalloc(&buf.p, nbytes));
    TEST_INFINI_THREAD(infinirtMemcpy(buf.p, args->data, nbytes, INFINIRT_MEMCPY_H2D));
    TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());
    TEST_INFINI_THREAD(infinirtMemcpy(output.data(), buf.p, nbytes, INFINIRT_MEMCPY_D2H));
    if (checkData(output.data(), args->ans, args->dtype, args->count) != 0) {
        return nullptr; // *result stays 1 -> reported as incorrect by the caller
    }

    for (size_t i = 0; i < WARM_UPS; i++) {
        TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    }
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());

    // Timed section: average latency over ITERATIONS calls.
    auto start = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < ITERATIONS; i++) {
        TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    }
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());
    auto end = std::chrono::high_resolution_clock::now();

    *args->time = std::chrono::duration<double, std::milli>(end - start).count() / ITERATIONS;
    *args->result = 0;
    return nullptr;
}
// Drives one AllReduce pass for every (dtype, count) combination, spawning one
// pthread per rank. Returns 0 when all combinations pass, 1 on first failure.
int testAllReduce(infiniDevice_t device_type, int ndevice) {
    std::vector<ThreadArgs> thread_args(ndevice);
    std::vector<infinicclComm_t> comms(ndevice);
    std::vector<pthread_t> threads(ndevice);
    std::vector<int> device_ids(ndevice);
    std::vector<int> results(ndevice);
    std::vector<double> times(ndevice);

    // Host buffers sized for the widest dtype (float). std::vector replaces the
    // original raw malloc/free, which leaked when comm init failed below.
    std::vector<float> data(MAX_COUNT);
    std::vector<float> ans(MAX_COUNT);

    for (int i = 0; i < ndevice; i++) {
        device_ids[i] = i;
    }
    // NOTE(review): the communicators are never destroyed (no infinicclCommDestroy
    // call); acceptable for a test binary but worth confirming.
    TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
    for (infiniDtype_t dtype : TEST_DTYPES) {
        // With INFINICCL_SUM and every element set to 1 on each rank, the
        // expected result is `ndevice` in every position.
        setData(dtype, data.data(), MAX_COUNT, 1.0f);
        setData(dtype, ans.data(), MAX_COUNT, 1.0f * ndevice);
        for (size_t count : TEST_COUNTS) {
            std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl;
            for (int rank = 0; rank < ndevice; rank++) {
                thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data.data(), ans.data(), &results[rank], &times[rank]};
                pthread_create(&threads[rank], NULL, testAllReduceThread, &thread_args[rank]);
            }
            for (int rank = 0; rank < ndevice; rank++) {
                pthread_join(threads[rank], NULL);
            }
            int failed = std::accumulate(results.begin(), results.end(), 0);
            for (int rank = 0; rank < ndevice; rank++) {
                if (results[rank] != 0) {
                    std::cout << "Rank " << rank << ": incorrect results." << std::endl;
                } else {
                    std::cout << "Rank " << rank << ": " << times[rank] << " ms." << std::endl;
                }
            }
            if (failed > 0) {
                std::cout << "Failed with " << failed << " errors." << std::endl
                          << std::endl;
                return 1;
            }
            std::cout << std::endl;
        }
    }
    return 0;
}
#ifndef INFINICCL_TEST_HPP
#define INFINICCL_TEST_HPP
#include <infiniccl.h>
#include "../utils.h"
// Runs the AllReduce correctness + latency test on `ndevice` devices of
// `device_type`. Returns 0 on success, nonzero on failure.
int testAllReduce(infiniDevice_t device_type, int ndevice);
#endif // INFINICCL_TEST_HPP
#include "infiniccl_test.hpp"
#include <iostream>
// Parsed command-line options: only the target device backend is configurable.
struct ParsedArgs {
    infiniDevice_t device_type;
};
// Prints command-line usage and terminates the process with a nonzero code.
void printUsage() {
    std::cout << "Usage:" << std::endl
              << std::endl;
    std::cout << "infiniccl-test --<device>" << std::endl
              << std::endl;
    std::cout << "    --<device>" << std::endl;
    std::cout << "        Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon)." << std::endl
              << std::endl;
    std::cout << "The program will run tests on all visible devices of the specified device type."
              // Fixed typo: was "CUDA_VSIBLE_DEVICES", which would mislead users.
              << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
    exit(-1);
}
// Expands to an `if` mapping a CLI flag string to its device enum; the call
// site chains invocations with `else` to form an if/else ladder.
#define PARSE_DEVICE(FLAG, DEVICE) \
    if (arg == FLAG) {             \
        args.device_type = DEVICE; \
    }
// Parses argv into ParsedArgs; any malformed input routes to printUsage(),
// which terminates the process.
ParsedArgs parseArgs(int argc, char *argv[]) {
    if (argc != 2) {
        printUsage();
    }
    if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") {
        printUsage();
    }
    ParsedArgs args;
    try {
        const std::string flag = argv[1];
        // Explicit ladder equivalent to the PARSE_DEVICE macro chain.
        if (flag == "--nvidia") {
            args.device_type = INFINI_DEVICE_NVIDIA;
        } else if (flag == "--cambricon") {
            args.device_type = INFINI_DEVICE_CAMBRICON;
        } else if (flag == "--ascend") {
            args.device_type = INFINI_DEVICE_ASCEND;
        } else if (flag == "--metax") {
            args.device_type = INFINI_DEVICE_METAX;
        } else if (flag == "--moore") {
            args.device_type = INFINI_DEVICE_MOORE;
        } else if (flag == "--iluvatar") {
            args.device_type = INFINI_DEVICE_ILUVATAR;
        } else if (flag == "--kunlun") {
            args.device_type = INFINI_DEVICE_KUNLUN;
        } else if (flag == "--sugon") {
            args.device_type = INFINI_DEVICE_SUGON;
        } else {
            printUsage();
        }
    } catch (const std::exception &) {
        printUsage();
    }
    return args;
}
int main(int argc, char *argv[]) {
ParsedArgs args = parseArgs(argc, argv);
int ndevice = 0;
if (infinirtGetDeviceCount(args.device_type, &ndevice) != INFINI_STATUS_SUCCESS) {
std::cout << "Failed to get device count" << std::endl;
return -1;
}
if (ndevice == 0) {
std::cout << "No devices found. Tests skipped." << std::endl;
return 0;
} else {
std::cout << "Found " << ndevice << " devices. Running tests..." << std::endl;
}
int failed = 0;
failed += testAllReduce(args.device_type, ndevice);
return failed;
}
#include "infiniccl_cuda.h"
#include <cuda_runtime.h>
#include <iostream>
#include <nccl.h>
#include <vector>
#include "../../utils.h"
#define CHECK_NCCL(API__) CHECK_INTERNAL(API__, ncclSuccess)
// Maps an infinirt stream handle to a CUDA stream; a null handle selects the
// default (legacy) CUDA stream.
inline cudaStream_t getCudaStream(infinirtStream_t stream) {
    return (stream == nullptr) ? (cudaStream_t)0 : static_cast<cudaStream_t>(stream);
}
// Translates the supported infini dtypes to NCCL dtypes; anything else aborts.
inline ncclDataType_t getNcclDtype(infiniDtype_t datatype) {
    if (datatype == INFINI_DTYPE_F32) {
        return ncclFloat;
    }
    if (datatype == INFINI_DTYPE_F16) {
        return ncclHalf;
    }
    std::abort();
    return ncclHalf;
}
// Translates the infiniccl reduction operator to its NCCL counterpart;
// unknown operators abort.
inline ncclRedOp_t getNcclRedOp(infinicclReduceOp_t op) {
    if (op == INFINICCL_SUM) {
        return ncclSum;
    }
    if (op == INFINICCL_PROD) {
        return ncclProd;
    }
    if (op == INFINICCL_MAX) {
        return ncclMax;
    }
    if (op == INFINICCL_MIN) {
        return ncclMin;
    }
    if (op == INFINICCL_AVG) {
        return ncclAvg;
    }
    std::abort();
    return ncclSum;
}
// Unwraps the backend-agnostic handle to the underlying ncclComm_t.
inline ncclComm_t getNcclComm(infinicclComm_t comm) {
    return static_cast<ncclComm_t>(comm->comm);
}
namespace infiniccl::cuda {
// Initializes one NCCL communicator per device and wraps each in the
// backend-agnostic InfinicclComm handle (heap-allocated; released by commDestroy).
infiniStatus_t commInitAll(
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {
    std::vector<ncclComm_t> nccl_comms(ndevice);
    // `device_ids` is already `const int *`; the original C-style cast was a no-op.
    CHECK_NCCL(ncclCommInitAll(nccl_comms.data(), ndevice, device_ids));
    for (int i = 0; i < ndevice; i++) {
        comms[i] = new InfinicclComm{INFINI_DEVICE_NVIDIA, device_ids[i], static_cast<void *>(nccl_comms[i])};
    }
    return INFINI_STATUS_SUCCESS;
}
// Destroys the NCCL communicator, then frees the wrapper allocated by commInitAll.
infiniStatus_t commDestroy(infinicclComm_t comm) {
    CHECK_NCCL(ncclCommDestroy(getNcclComm(comm)));
    delete comm;
    return INFINI_STATUS_SUCCESS;
}
// CUDA/NCCL AllReduce. Only F32 and F16 payloads are accepted; other dtypes
// are rejected with BAD_PARAM before touching NCCL.
infiniStatus_t allReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {
    const bool supported = (datatype == INFINI_DTYPE_F32) || (datatype == INFINI_DTYPE_F16);
    if (!supported) {
        return INFINI_STATUS_BAD_PARAM;
    }
    CHECK_NCCL(ncclAllReduce(sendbuf, recvbuf, count, getNcclDtype(datatype),
                             getNcclRedOp(op), getNcclComm(comm), getCudaStream(stream)));
    return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::cuda
#ifndef INFINICCL_CUDA_H_
#define INFINICCL_CUDA_H_
#include "../infiniccl_impl.h"
// Windows does not support CUDA
// Declare the real backend API only when both the CUDA API and CCL are
// enabled; otherwise declare no-op stubs reporting DEVICE_TYPE_NOT_SUPPORTED.
#if defined(ENABLE_CUDA_API) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
#endif
#endif /* INFINICCL_CUDA_H_ */
#include "infiniccl.h"
#include "./cuda/infiniccl_cuda.h"
// Public entry point: dispatches communicator creation to the backend that
// matches `device_type`.
__C infiniStatus_t infinicclCommInitAll(
    infiniDevice_t device_type,
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {
    switch (device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::commInitAll(comms, ndevice, device_ids);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Public entry point: destroying a null communicator is a harmless no-op;
// otherwise dispatch on the backend recorded in the handle.
__C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
    if (comm == nullptr) {
        return INFINI_STATUS_SUCCESS;
    }
    switch (comm->device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::commDestroy(comm);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Public entry point: validates the communicator and dispatches AllReduce to
// the backend recorded in the handle.
// Fixed typo: parameter was misspelled "dataype" (callers are unaffected by a
// C parameter rename).
__C infiniStatus_t infinicclAllReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {
    if (comm == nullptr) {
        return INFINI_STATUS_NULL_POINTER;
    }
    switch (comm->device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::allReduce(sendbuf, recvbuf, count, datatype, op, comm, stream);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
#ifndef INFINICCL_IMPL_H
#define INFINICCL_IMPL_H

#include "infiniccl.h"

// Internal communicator representation shared by all backends.
struct InfinicclComm {
    infiniDevice_t device_type;
    int device_id; // the actual device ID, not rank number
    void *comm;    // the actual communicator (e.g. ncclComm_t for CUDA)
};

// Declares the per-backend API inside namespace infiniccl::<NAMESPACE_>.
// IMPL_ is empty for real implementations (declarations only) or a stub body
// for disabled backends. Fixed typo: parameter was spelled "NAMSPACE".
#define INFINICCL_DEVICE_API(NAMESPACE_, IMPL_)             \
    namespace infiniccl::NAMESPACE_ {                       \
    infiniStatus_t commInitAll(                             \
        infinicclComm_t *comms,                             \
        int ndevice,                                        \
        const int *device_ids) IMPL_;                       \
                                                            \
    infiniStatus_t commDestroy(infinicclComm_t comm) IMPL_; \
                                                            \
    infiniStatus_t allReduce(                               \
        void *sendbuf,                                      \
        void *recvbuf,                                      \
        size_t count,                                       \
        infiniDtype_t datatype,                             \
        infinicclReduceOp_t op,                             \
        infinicclComm_t comm,                               \
        infinirtStream_t stream) IMPL_;                     \
    };

// Real backend: declarations only; definitions live in the backend source.
#define INFINICCL_DEVICE_API_IMPL(NAMESPACE_) \
    INFINICCL_DEVICE_API(NAMESPACE_, )

// Disabled backend: every function is a stub returning NOT_SUPPORTED.
#define INFINICCL_DEVICE_API_NOOP(NAMESPACE_) \
    INFINICCL_DEVICE_API(NAMESPACE_, { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; })

#endif // INFINICCL_IMPL_H
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
*/ */
DECLARE_INFINIOP_TEST(gemm) DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample) DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(rope) DECLARE_INFINIOP_TEST(rope)
#define REGISTER_INFINIOP_TEST(name) \ #define REGISTER_INFINIOP_TEST(name) \
...@@ -25,7 +26,8 @@ DECLARE_INFINIOP_TEST(rope) ...@@ -25,7 +26,8 @@ DECLARE_INFINIOP_TEST(rope)
{ \ { \
REGISTER_INFINIOP_TEST(gemm) \ REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \ REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(rope) \ REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(rope) \
} }
namespace infiniop_test { namespace infiniop_test {
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::mul {
// Tensors backing one mul test case: inputs a/b, output c, reference answer ans.
struct Test::Attributes {
    std::shared_ptr<Tensor> a;
    std::shared_ptr<Tensor> b;
    std::shared_ptr<Tensor> c;
    std::shared_ptr<Tensor> ans;
};
// Constructs a mul test case from the deserialized tensor map; throws if any
// required tensor ("a", "b", "c", "ans") is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    for (const char *required : {"a", "b", "c", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }
    test->_attributes->a = tensors["a"];
    test->_attributes->b = tensors["b"];
    test->_attributes->c = tensors["c"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Executes the mul operator on `device`, validates the result against `ans`,
// then benchmarks it. Returns a passed/failed Result.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopMulDescriptor_t op_desc = nullptr;
    void *workspace = nullptr;
    // RAII guard: the original leaked both the descriptor and the workspace on
    // every return path; this releases whatever was actually acquired.
    struct Guard {
        infiniopMulDescriptor_t *desc;
        void **ws;
        ~Guard() {
            if (*ws != nullptr) {
                infinirtFree(*ws);
            }
            if (*desc != nullptr) {
                infiniopDestroyMulDescriptor(*desc);
            }
        }
    } guard{&op_desc, &workspace};

    // Move test tensors onto the target device.
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);
    CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
                                         c->desc(),
                                         a->desc(),
                                         b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));

    // Correctness: run once and compare with the precomputed answer tensor.
    CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
                         c->data(),
                         a->data(),
                         b->data(),
                         nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    try {
        allClose(c, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    // Benchmark: warm_ups/iterations handled inside benchmark().
    double elapsed_time = benchmark(
        [=]() {
            infiniopMul(
                op_desc, workspace, workspace_size,
                c->data(),
                a->data(),
                b->data(),
                nullptr);
        },
        warm_ups, iterations);
    return TEST_PASSED(elapsed_time);
}
// Mul has no scalar attributes beyond its tensors.
std::vector<std::string> Test::attribute_names() {
    return {};
}
// Tensors every mul test case file must provide.
std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
}
// Human-readable summary of the test case: operator name, tensor shapes/dtypes,
// and the comparison tolerances in scientific notation.
std::string Test::toString() const {
    std::ostringstream repr;
    repr << op_name() << std::endl;
    repr << "- a: " << _attributes->a->info() << std::endl;
    repr << "- b: " << _attributes->b->info() << std::endl;
    repr << "- c: " << _attributes->c->info() << std::endl;
    repr << std::scientific << std::setprecision(2);
    repr << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return repr.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::mul
...@@ -102,7 +102,7 @@ struct DeviceImpl::Opaque {}; ...@@ -102,7 +102,7 @@ struct DeviceImpl::Opaque {};
template <typename... Args> template <typename... Args>
utils::Result<DeviceImpl> DeviceImpl::create(Args &&...args) { utils::Result<DeviceImpl> DeviceImpl::create(Args &&...args) {
return utils::Result<DeviceImpl>(nullptr); return INFINI_STATUS_NOT_IMPLEMENTED;
} }
// Perform elementwise operation for different input types // Perform elementwise operation for different input types
......
...@@ -208,7 +208,7 @@ struct DeviceImpl::Opaque { ...@@ -208,7 +208,7 @@ struct DeviceImpl::Opaque {
* @param args Additional arguments forwarded to the operation. * @param args Additional arguments forwarded to the operation.
* @return infiniStatus_t Returns success or failure status. * @return infiniStatus_t Returns success or failure status.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args> template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info, infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
void *output, void *output,
...@@ -241,7 +241,7 @@ struct DeviceImpl::Opaque { ...@@ -241,7 +241,7 @@ struct DeviceImpl::Opaque {
* @param args Additional arguments forwarded to the operation. * @param args Additional arguments forwarded to the operation.
* @return infiniStatus_t Returns success or failure status. * @return infiniStatus_t Returns success or failure status.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args, template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0> std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info, infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
...@@ -329,7 +329,7 @@ private: ...@@ -329,7 +329,7 @@ private:
* @param args Additional arguments passed to the kernel. * @param args Additional arguments passed to the kernel.
* @return infiniStatus_t Status code indicating success or failure. * @return infiniStatus_t Status code indicating success or failure.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args> template <uint32_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
infiniStatus_t launchElementwiseKernel( infiniStatus_t launchElementwiseKernel(
const op::elementwise::ElementwiseInfo &info, const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
...@@ -358,8 +358,8 @@ private: ...@@ -358,8 +358,8 @@ private:
d_output_shape, d_output_strides, d_output_shape, d_output_strides,
d_input_shapes, d_input_strides, stream)); d_input_shapes, d_input_strides, stream));
dim3 blockDims(std::min(BLOCK_SIZE, static_cast<size_t>(internal->maxThreadsPerBlock()))); dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX()))); dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
size_t step = gridDims.x * blockDims.x; size_t step = gridDims.x * blockDims.x;
for (size_t i = 0; i < output_size; i += step) { for (size_t i = 0; i < output_size; i += step) {
......
...@@ -84,8 +84,9 @@ private: ...@@ -84,8 +84,9 @@ private:
_output_contiguous(output_contiguous) {} _output_contiguous(output_contiguous) {}
public: public:
// Get the Memory size of the meta data in bytes
inline size_t getMetaMemSize() const { inline size_t getMetaMemSize() const {
return _meta.size(); return _meta.size() * sizeof(size_t);
} }
inline const int8_t *getMetaStart() const { inline const int8_t *getMetaStart() const {
return reinterpret_cast<const int8_t *>(_meta.data()); return reinterpret_cast<const int8_t *>(_meta.data());
...@@ -167,7 +168,7 @@ public: ...@@ -167,7 +168,7 @@ public:
+ input_size * ndim * sizeof(shape_unit) + input_size * ndim * sizeof(shape_unit)
+ input_size * ndim * sizeof(stride_unit) + input_size * ndim * sizeof(stride_unit)
+ 2 * input_size * sizeof(bool); + 2 * input_size * sizeof(bool);
std::vector<size_t> meta(meta_mem_size); std::vector<size_t> meta(CEIL_DIV(meta_mem_size, sizeof(size_t)));
int8_t *meta_ptr = reinterpret_cast<int8_t *>(meta.data()); int8_t *meta_ptr = reinterpret_cast<int8_t *>(meta.data());
const auto output_shape = output_desc->shape(); const auto output_shape = output_desc->shape();
......
#include "add_cpu.h"
namespace op::add::cpu {
Descriptor::~Descriptor() = default;
// Validates tensor metadata and builds the CPU elementwise descriptor for
// c = a + b. Requires identical shapes for a, b, c and a float dtype.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    // Only shapes are compared here; NOTE(review): input dtypes are not checked
    // against the output dtype — presumably validated downstream; confirm.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Runs the elementwise addition, dispatching on the dtype fixed at descriptor
// creation. `workspace`/`workspace_size` are part of the uniform operator API
// but unused by the CPU add path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AddOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AddOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AddOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable `return INFINI_STATUS_SUCCESS;` after the switch removed:
    // every case above already returns.
}
} // namespace op::add::cpu
#ifndef __ADD_CPU_H__
#define __ADD_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu)
namespace op::add::cpu {

// Elementwise addition functor consumed by the generic elementwise engine.
// Replaced the C-style `typedef struct ... AddOp;` with a plain struct — the
// typedef is redundant in C++ and the `public:` label was a no-op in a struct.
struct AddOp {
    // Number of input tensors the elementwise framework must supply.
    static constexpr size_t num_inputs = 2;

    // Returns a + b for any type supporting operator+.
    template <typename T>
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};

} // namespace op::add::cpu
#endif // __ADD_CPU_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment