Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
......@@ -30,36 +30,13 @@ jobs:
with:
xmake-version: latest
- name: Build & Install (Linux)
if: matrix.os == 'ubuntu-latest'
run: bash scripts/install.sh . --omp=y
- name: Build & Install (Windows)
if: matrix.os == 'windows-latest'
run: scripts/install.bat . --omp=y
- name: Build & Install
run: python scripts/install.py --omp=y
- name: install python packages
run: |
pip install numpy
pip install torch
- name: Python Test (Linux)
if: matrix.os == 'ubuntu-latest'
run: |
export LD_LIBRARY_PATH=$HOME/.infini/lib:$LD_LIBRARY_PATH
python test/infiniop/gemm.py --cpu
python test/infiniop/rms_norm.py --cpu
python test/infiniop/causal_softmax.py --cpu
python test/infiniop/swiglu.py --cpu
python test/infiniop/random_sample.py --cpu
- name: Python Test (Windows)
if: matrix.os == 'windows-latest'
run: |
set PATH=$env:USERPROFILE\.infini\bin;$env:PATH
python test\infiniop\gemm.py --cpu
python test\infiniop\rms_norm.py --cpu
python test\infiniop\causal_softmax.py --cpu
python test\infiniop\swiglu.py --cpu
python test\infiniop\random_sample.py --cpu
- name: Python Test
run: python scripts/python_test.py --cpu
# InfiniCore
[![Doc](https://img.shields.io/badge/Document-ready-blue)](https://github.com/InfiniTensor/InfiniCore-Documentation)
[![CI](https://github.com/InfiniTensor/InfiniCore/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/InfiniTensor/InfiniCore/actions)
[![license](https://img.shields.io/github/license/InfiniTensor/InfiniCore)](https://mit-license.org/)
![GitHub repo size](https://img.shields.io/github/repo-size/InfiniTensor/InfiniCore)
![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/InfiniTensor/InfiniCore)
[![GitHub Issues](https://img.shields.io/github/issues/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/pulls)
![GitHub contributors](https://img.shields.io/github/contributors/InfiniTensor/InfiniCore)
![GitHub commit activity](https://img.shields.io/github/commit-activity/m/InfiniTensor/InfiniCore)
InfiniCore is a cross-platform, unified programming toolkit that exposes the capabilities of different chip platforms (compute, runtime, communication, and more) through a unified C-language interface. Currently supported hardware and backends include:
- CPU;
......@@ -17,18 +28,29 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
### One-Click Installation
The `scripts/` directory provides an `install.bat` installation script for Windows and an `install.sh` installation script for Linux. Usage:
The `scripts/` directory provides the `install.py` installation script. Usage:
```shell
cd InfiniCore
# Windows
.\scripts\install.bat . --nv-gpu=true
# Linux
source ./scripts/install.sh . --nv-gpu=true
python scripts/install.py [XMAKE_CONFIG_FLAGS]
```
The `XMAKE_CONFIG_FLAGS` argument is the xmake build configuration and supports the following options:
| Option | Description | Default
|--------------------------|-------------------------------|:-:
| `--omp=[y\|n]` | Whether to use OpenMP | y
| `--cpu=[y\|n]` | Whether to build the CPU backend | y
| `--nv-gpu=[y\|n]` | Whether to build the NVIDIA GPU backend | n
| `--ascend-npu=[y\|n]` | Whether to build the Ascend NPU backend | n
| `--cambricon-mlu=[y\|n]` | Whether to build the Cambricon MLU backend | n
| `--metax-gpu=[y\|n]` | Whether to build the MetaX GPU backend | n
| `--moore-gpu=[y\|n]` | Whether to build the Moore Threads GPU backend | n
| `--sugon-dcu=[y\|n]` | Whether to build the Sugon DCU backend | n
| `--kunlun-xpu=[y\|n]` | Whether to build the Kunlun XPU backend | n
| `--ccl=[y\|n]` | Whether to build the InfiniCCL communication library backend | n
### Manual Installation
1. Project configuration
......@@ -75,14 +97,32 @@ source ./scripts/install.sh . --nv-gpu=true
#### Running Python Operator Tests
```shell
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
```
#### Running All Python Operator Tests at Once
```shell
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
```
#### Operator Test Framework
See the `test/infiniop-test` directory for details.
#### Communication Library (InfiniCCL) Tests
Build (InfiniCCL must be installed first):
```shell
xmake build infiniccl-test
```
Run the test on the NVIDIA platform (it automatically uses all visible devices):
```shell
infiniccl-test --nvidia
```
## Development Guide
### Code Formatting
......@@ -119,3 +159,26 @@ options:
- If `--check` is set, the script only checks whether the code needs reformatting and does not modify any files;
- Use `--c` to specify the C/C++ formatter; the default is `clang-format-16`;
- Use `--python` to specify the Python formatter, `black`.
### VS Code Development Setup
For the basic configuration, see the [official xmake documentation](https://xmake.io/#/zh-cn/plugin/more_plugins?id=%e9%85%8d%e7%bd%ae-intellsence).
- TL;DR
- clangd
Open *xmake.lua* and save it once to trigger compile-command generation; a *.vscode/compile_commands.json* file will be generated automatically under the workspace directory. Then create *settings.json* in that folder and fill in:
> .vscode/settings.json
```json
{
"clangd.arguments": [
"--compile-commands-dir=.vscode"
],
"xmake.additionalConfigArguments": [
// configure XMAKE_CONFIG_FLAGS here
"--nv-gpu=y"
],
}
```
#ifndef __INFINICCL_API_H__
#define __INFINICCL_API_H__
#include "infinirt.h"
typedef enum {
INFINICCL_SUM = 0,
INFINICCL_PROD = 1,
INFINICCL_MAX = 2,
INFINICCL_MIN = 3,
INFINICCL_AVG = 4,
} infinicclReduceOp_t;
struct InfinicclComm;
typedef struct InfinicclComm *infinicclComm_t;
__C __export infiniStatus_t infinicclCommInitAll(
infiniDevice_t device_type,
infinicclComm_t *comms,
int ndevice,
const int *device_ids);
__C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
__C __export infiniStatus_t infinicclAllReduce(
void *sendbuf,
void *recvbuf,
size_t count,
infiniDtype_t datatype,
infinicclReduceOp_t op,
infinicclComm_t comm,
infinirtStream_t stream);
#endif
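
Below is a minimal, hedged usage sketch for the all-reduce API declared above, shown for the single-device degenerate case so no extra threads are needed (the multi-device, one-thread-per-rank pattern is what `infiniccl-test` does further down). The element count, data type, and use of the default (`NULL`) stream are illustrative choices, and the InfiniRT calls are assumed to be available through `infiniccl.h` / `infinirt.h`:

```cpp
// Hedged sketch, not part of this header: create one communicator, run an
// in-place SUM all-reduce on a single NVIDIA device, then tear down.
#include <infiniccl.h> // assumed to pull in infinirt.h and infiniSizeOf()
#include <cstdlib>
#include <cstring>

int allReduceSketch() {
    const size_t count = 1024; // illustrative element count
    const int device_id = 0;
    infinicclComm_t comm = nullptr;

    if (infinicclCommInitAll(INFINI_DEVICE_NVIDIA, &comm, 1, &device_id) != INFINI_STATUS_SUCCESS) {
        return 1;
    }
    infinirtSetDevice(INFINI_DEVICE_NVIDIA, device_id);

    const size_t bytes = count * infiniSizeOf(INFINI_DTYPE_F32);
    void *host = std::malloc(bytes);
    std::memset(host, 0, bytes);

    void *buf = nullptr;
    infinirtMalloc(&buf, bytes);
    infinirtMemcpy(buf, host, bytes, INFINIRT_MEMCPY_H2D);

    // In-place reduction on the default stream; with one rank the output equals the input.
    infinicclAllReduce(buf, buf, count, INFINI_DTYPE_F32, INFINICCL_SUM, comm, NULL);
    infinirtDeviceSynchronize();
    infinirtMemcpy(host, buf, bytes, INFINIRT_MEMCPY_D2H);

    infinirtFree(buf);
    std::free(host);
    return infinicclCommDestroy(comm) == INFINI_STATUS_SUCCESS ? 0 : 1;
}
```
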
......@@ -6,18 +6,19 @@
#include "infiniop/ops/attention.h"
#include "infiniop/ops/avg_pool.h"
#include "infiniop/ops/causal_softmax.h"
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/expand.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/gemm.h"
#include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
#include "infiniop/ops/rms_norm.h"
#include "infiniop/ops/rotary_embedding.h"
#include "infiniop/ops/rope.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/tensor_descriptor.h"
......
......@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
......
......@@ -5,16 +5,20 @@
typedef struct InfiniopDescriptor *infiniopCausalSoftmaxDescriptor_t;
__C __export infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(infiniopHandle_t handle,
__C __export infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
infiniopHandle_t handle,
infiniopCausalSoftmaxDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc);
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc);
__C __export infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc,
__C __export infiniStatus_t infiniopCausalSoftmax(
infiniopCausalSoftmaxDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *data,
void *y,
const void *x,
void *stream);
__C __export infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc);
......
#ifndef __INFINIOP_CLIP_API_H__
#define __INFINIOP_CLIP_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;
__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
infiniopClipDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t min_val,
infiniopTensorDescriptor_t max_val);
__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *min_val,
const void *max_val,
void *stream);
__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
#endif
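
As a hedged illustration of how the clip API above is meant to be driven, here is a sketch of the usual create → query workspace → execute → destroy lifecycle. The handle, tensor descriptors, and device pointers are assumed to be prepared by the caller (their creation is declared elsewhere, not in this header), and allocating the workspace through `infinirtMalloc` is an assumption borrowed from the runtime API used elsewhere in this commit:

```cpp
// Hedged sketch, not from the repo: full lifecycle of the clip operator
// declared above, given pre-built descriptors and device buffers.
infiniStatus_t runClip(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t y_desc,
                       infiniopTensorDescriptor_t x_desc,
                       infiniopTensorDescriptor_t min_desc,
                       infiniopTensorDescriptor_t max_desc,
                       void *y, const void *x,
                       const void *min_val, const void *max_val,
                       void *stream) {
    infiniopClipDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateClipDescriptor(handle, &desc, y_desc, x_desc, min_desc, max_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    infiniopGetClipWorkspaceSize(desc, &workspace_size);

    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMalloc(&workspace, workspace_size); // assumption: device already selected via infinirtSetDevice
    }

    status = infiniopClip(desc, workspace, workspace_size, y, x, min_val, max_val, stream);

    if (workspace != nullptr) {
        infinirtFree(workspace);
    }
    infiniopDestroyClipDescriptor(desc);
    return status;
}
```
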
#ifndef __INFINIOP_MUL_API_H__
#define __INFINIOP_MUL_API_H__
#include "../operator_descriptor.h"
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
infiniopMulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c,
infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
#ifndef __INFINIOP_ROTARY_EMBEDDING_API_H__
#define __INFINIOP_ROTARY_EMBEDDING_API_H__
#ifndef __INFINIOP_ROPE_API_H__
#define __INFINIOP_ROPE_API_H__
#include "../operator_descriptor.h"
......@@ -8,7 +8,8 @@ typedef struct InfiniopDescriptor *infiniopRoPEDescriptor_t;
__C __export infiniStatus_t infiniopCreateRoPEDescriptor(
infiniopHandle_t handle,
infiniopRoPEDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t t,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
infiniopTensorDescriptor_t pos_ids,
infiniopTensorDescriptor_t sin_table,
infiniopTensorDescriptor_t cos_table);
......@@ -19,7 +20,8 @@ __C __export infiniStatus_t infiniopRoPE(
infiniopRoPEDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *t,
void *y,
const void *x,
void const *pos_ids,
void const *sin_table,
void const *cos_table,
......
......@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t hand
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
__C __export infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
void const *a,
void const *b,
......
@echo off
setlocal enabledelayedexpansion
:: Check if project path is provided
if "%~1"=="" (
echo Usage: install.bat PROJECT_PATH [XMAKE_CONFIG_FLAGS]
exit /b 1
)
:: Set INFINI_ROOT
set "INFINI_ROOT=%USERPROFILE%\.infini"
:: Check if INFINI_ROOT\bin is already in PATH, if not, add it
echo %PATH% | findstr /I /C:"%INFINI_ROOT%\bin" >nul
if %errorlevel% neq 0 set "PATH=%INFINI_ROOT%\bin;%PATH%"
:: Convert relative path to absolute path
for %%I in ("%~1") do set ABS_PATH=%%~fI
:: Change to the project directory
cd %ABS_PATH%
:: Build xmake config flags
set XMAKE_FLAGS=
set i=0
for %%A in (%*) do (
if !i! gtr 0 set XMAKE_FLAGS=!XMAKE_FLAGS! %%A
set /a i+=1
)
:: Start installation
xmake clean -a
if %errorlevel% neq 0 exit /b %errorlevel%
xmake f %XMAKE_FLAGS% -cv
if %errorlevel% neq 0 exit /b %errorlevel%
xmake
if %errorlevel% neq 0 exit /b %errorlevel%
xmake install
if %errorlevel% neq 0 exit /b %errorlevel%
xmake build infiniop-test
if %errorlevel% neq 0 exit /b %errorlevel%
xmake install infiniop-test
if %errorlevel% neq 0 exit /b %errorlevel%
import os
import subprocess
import platform
import sys
from set_env import set_env
PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
os.chdir(PROJECT_DIR)
def run_cmd(cmd):
subprocess.run(cmd, text=True, encoding="utf-8", check=True, shell=True)
def install(xmake_config_flags=""):
run_cmd(f"xmake f {xmake_config_flags} -cv")
run_cmd("xmake")
run_cmd("xmake install")
run_cmd("xmake build infiniop-test")
run_cmd("xmake install infiniop-test")
if __name__ == "__main__":
set_env()
install(" ".join(sys.argv[1:]))
#!/bin/bash
set -e # Exit on error
# Check if project path is provided
if [ -z "$1" ]; then
echo "Usage: source install.sh PROJECT_PATH [XMAKE_CONFIG_FLAGS]"
exit 1
fi
# Set INFINI_ROOT
export INFINI_ROOT="$HOME/.infini"
# Check if INFINI_ROOT/bin is already in PATH, if not, add it
case ":$PATH:" in
*":$INFINI_ROOT/bin:"*) ;; # Already in PATH, do nothing
*) export PATH="$INFINI_ROOT/bin:$PATH" ;; # Add to PATH
esac
# Check if INFINI_ROOT/lib is already in LD_LIBRARY_PATH, if not, add it
case ":$LD_LIBRARY_PATH:" in
*":$INFINI_ROOT/lib:"*) ;; # Already in LD_LIBRARY_PATH, do nothing
*) export LD_LIBRARY_PATH="$INFINI_ROOT/lib:$LD_LIBRARY_PATH" ;; # Add to LD_LIBRARY_PATH
esac
# Change to project directory
cd "$1"
# Shift first argument (project path) and pass the rest to xmake
shift
xmake clean -a
xmake f "$@" -cv
xmake
xmake install
xmake build infiniop-test
xmake install infiniop-test
import os
import subprocess
from set_env import set_env
import sys
PROJECT_DIR = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
)
os.chdir(PROJECT_DIR)
def run_tests(args):
failed = []
for test in [
"add.py",
"gemm.py",
"random_sample.py",
"rms_norm.py",
"rope.py",
"swiglu.py",
"attention.py",
]:
result = subprocess.run(
f"python {test} {args}", text=True, encoding="utf-8", shell=True
)
if result.returncode != 0:
failed.append(test)
return failed
if __name__ == "__main__":
set_env()
failed = run_tests(" ".join(sys.argv[1:]))
if len(failed) == 0:
print("\033[92mAll tests passed!\033[0m")
else:
print("\033[91mThe following tests failed:\033[0m")
for test in failed:
print(f"\033[91m - {test}\033[0m")
exit(len(failed))
import os
import platform
def set_env():
if os.environ.get("INFINI_ROOT") is None:
os.environ["INFINI_ROOT"] = os.path.expanduser("~/.infini")
if platform.system() == "Windows":
new_path = os.path.expanduser(os.environ.get("INFINI_ROOT") + "/bin")
if new_path not in os.environ.get("PATH", ""):
os.environ["PATH"] = f"{new_path};{os.environ.get('PATH', '')}"
elif platform.system() == "Linux":
new_path = os.path.expanduser(os.environ.get("INFINI_ROOT") + "/bin")
if new_path not in os.environ.get("PATH", ""):
os.environ["PATH"] = f"{new_path}:{os.environ.get('PATH', '')}"
new_lib_path = os.path.expanduser(os.environ.get("INFINI_ROOT") + "/lib")
if new_lib_path not in os.environ.get("LD_LIBRARY_PATH", ""):
os.environ["LD_LIBRARY_PATH"] = (
f"{new_lib_path}:{os.environ.get('LD_LIBRARY_PATH', '')}"
)
else:
raise RuntimeError("Unsupported platform.")
#include "infiniccl_test.hpp"
#include <chrono>
#include <cstring>
#include <iostream>
#include <numeric>
#include <pthread.h>
#include <vector>
#define TEST_INFINI(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return 1)
#define TEST_INFINI_THREAD(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return nullptr)
const size_t MAX_COUNT = 100ULL * 1024 * 1024;
const size_t TEST_COUNTS[] = {
128,
1024,
4 * 1024,
MAX_COUNT,
};
const infiniDtype_t TEST_DTYPES[] = {INFINI_DTYPE_F32, INFINI_DTYPE_F16};
const size_t WARM_UPS = 10;
const size_t ITERATIONS = 100;
struct ThreadArgs {
int rank;
int device_id;
infinicclComm_t comm;
infiniDevice_t device_type;
infiniDtype_t dtype;
size_t count;
const void *data;
const void *ans;
int *result;
double *time;
};
void setData(infiniDtype_t dtype, void *data, size_t count, float val) {
switch (dtype) {
case INFINI_DTYPE_F32:
for (size_t i = 0; i < count; i++) {
((float *)data)[i] = val;
}
break;
case INFINI_DTYPE_F16:
for (size_t i = 0; i < count; i++) {
((fp16_t *)data)[i] = utils::cast<fp16_t>(val);
}
break;
default:
std::abort();
break;
}
}
template <typename T>
int checkData(const T *actual_, const T *expected_, size_t count) {
int failed = 0;
for (size_t i = 0; i < count; i++) {
if constexpr (std::is_same<T, fp16_t>::value) {
float actual = utils::cast<float>(actual_[i]);
float expected = utils::cast<float>(expected_[i]);
if (std::abs(actual - expected) > 1e-4) {
failed += 1;
}
} else {
if (std::abs(actual_[i] - expected_[i]) > 1e-4) {
failed += 1;
}
}
}
return failed;
}
int checkData(const void *actual, const void *expected, infiniDtype_t dtype, size_t count) {
switch (dtype) {
case INFINI_DTYPE_F32:
return checkData((const float *)actual, (const float *)expected, count);
case INFINI_DTYPE_F16:
return checkData((const fp16_t *)actual, (const fp16_t *)expected, count);
default:
std::abort();
return 1;
}
}
void *testAllReduceThread(void *arg) {
ThreadArgs *args = (ThreadArgs *)arg;
*(args->result) = 1;
TEST_INFINI_THREAD(infinirtSetDevice(args->device_type, args->device_id));
void *output = std::malloc(args->count * infiniSizeOf(args->dtype));
std::memset(output, 0, args->count * infiniSizeOf(args->dtype));
void *buf;
TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype)));
TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D));
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H));
if (checkData(output, args->ans, args->dtype, args->count) != 0) {
std::free(output);
infinirtFree(buf);
return nullptr;
}
for (size_t i = 0; i < WARM_UPS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
}
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
// measure time
auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < ITERATIONS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
}
TEST_INFINI_THREAD(infinirtDeviceSynchronize());
auto end = std::chrono::high_resolution_clock::now();
double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count();
*args->time = elapsed_ms / ITERATIONS;
*args->result = 0;
std::free(output);
infinirtFree(buf);
return nullptr;
}
int testAllReduce(infiniDevice_t device_type, int ndevice) {
std::vector<ThreadArgs> thread_args(ndevice);
std::vector<infinicclComm_t> comms(ndevice);
std::vector<pthread_t> threads(ndevice);
std::vector<int> device_ids(ndevice);
std::vector<int> results(ndevice);
std::vector<double> times(ndevice);
void *data = std::malloc(MAX_COUNT * sizeof(float)); // Use float as max dtype size
void *ans = std::malloc(MAX_COUNT * sizeof(float));
for (int i = 0; i < ndevice; i++) {
device_ids[i] = i;
}
TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
for (infiniDtype_t dtype : TEST_DTYPES) {
setData(dtype, data, MAX_COUNT, 1.0f);
setData(dtype, ans, MAX_COUNT, 1.0f * ndevice);
for (size_t count : TEST_COUNTS) {
std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl;
for (int rank = 0; rank < ndevice; rank++) {
thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data, ans, &results[rank], &times[rank]};
pthread_create(&threads[rank], NULL, testAllReduceThread, &thread_args[rank]);
}
for (int rank = 0; rank < ndevice; rank++) {
pthread_join(threads[rank], NULL);
}
int failed = std::accumulate(results.begin(), results.end(), 0);
for (int rank = 0; rank < ndevice; rank++) {
if (results[rank] != 0) {
std::cout << "Rank " << rank << ": incorrect results." << std::endl;
} else {
std::cout << "Rank " << rank << ": " << times[rank] << " ms." << std::endl;
}
}
if (failed > 0) {
std::cout << "Failed with " << failed << " errors." << std::endl
<< std::endl;
std::free(data);
std::free(ans);
return 1;
}
std::cout << std::endl;
}
}
std::free(data);
std::free(ans);
return 0;
}
#ifndef INFINICCL_TEST_HPP
#define INFINICCL_TEST_HPP
#include <infiniccl.h>
#include "../utils.h"
int testAllReduce(infiniDevice_t device_type, int ndevice);
#endif // INFINICCL_TEST_HPP
#include "infiniccl_test.hpp"
#include <iostream>
struct ParsedArgs {
infiniDevice_t device_type;
};
void printUsage() {
std::cout << "Usage:" << std::endl
<< std::endl;
std::cout << "infiniccl-test --<device>" << std::endl
<< std::endl;
std::cout << " --<device>" << std::endl;
std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon)." << std::endl
<< std::endl;
std::cout << "The program will run tests on all visible devices of the specified device type."
<< " Use environment variables such as CUDA_VISIBLE_DEVICES to limit the visible device IDs.";
exit(-1);
}
#define PARSE_DEVICE(FLAG, DEVICE) \
if (arg == FLAG) { \
args.device_type = DEVICE; \
}
ParsedArgs parseArgs(int argc, char *argv[]) {
if (argc != 2) {
printUsage();
}
if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") {
printUsage();
}
ParsedArgs args;
try {
std::string arg = argv[1];
// clang-format off
PARSE_DEVICE("--nvidia", INFINI_DEVICE_NVIDIA)
else PARSE_DEVICE("--cambricon", INFINI_DEVICE_CAMBRICON)
else PARSE_DEVICE("--ascend", INFINI_DEVICE_ASCEND)
else PARSE_DEVICE("--metax", INFINI_DEVICE_METAX)
else PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
else PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
else PARSE_DEVICE("--sugon", INFINI_DEVICE_SUGON)
else {
printUsage();
}
// clang-format on
} catch (const std::exception &) {
printUsage();
}
return args;
}
int main(int argc, char *argv[]) {
ParsedArgs args = parseArgs(argc, argv);
int ndevice = 0;
if (infinirtGetDeviceCount(args.device_type, &ndevice) != INFINI_STATUS_SUCCESS) {
std::cout << "Failed to get device count" << std::endl;
return -1;
}
if (ndevice == 0) {
std::cout << "No devices found. Tests skipped." << std::endl;
return 0;
} else {
std::cout << "Found " << ndevice << " devices. Running tests..." << std::endl;
}
int failed = 0;
failed += testAllReduce(args.device_type, ndevice);
return failed;
}
#include "infiniccl_cuda.h"
#include <cuda_runtime.h>
#include <iostream>
#include <nccl.h>
#include <vector>
#include "../../utils.h"
#define CHECK_NCCL(API__) CHECK_INTERNAL(API__, ncclSuccess)
inline cudaStream_t getCudaStream(infinirtStream_t stream) {
if (stream == nullptr) {
return 0;
}
return static_cast<cudaStream_t>(stream);
}
inline ncclDataType_t getNcclDtype(infiniDtype_t datatype) {
switch (datatype) {
case INFINI_DTYPE_F32:
return ncclFloat;
case INFINI_DTYPE_F16:
return ncclHalf;
default:
std::abort();
return ncclHalf;
}
}
inline ncclRedOp_t getNcclRedOp(infinicclReduceOp_t op) {
switch (op) {
case INFINICCL_SUM:
return ncclSum;
case INFINICCL_PROD:
return ncclProd;
case INFINICCL_MAX:
return ncclMax;
case INFINICCL_MIN:
return ncclMin;
case INFINICCL_AVG:
return ncclAvg;
default:
std::abort();
return ncclSum;
}
}
inline ncclComm_t getNcclComm(infinicclComm_t comm) {
return static_cast<ncclComm_t>(comm->comm);
}
namespace infiniccl::cuda {
infiniStatus_t commInitAll(
infinicclComm_t *comms,
int ndevice,
const int *device_ids) {
std::vector<ncclComm_t> nccl_comms(ndevice);
CHECK_NCCL(ncclCommInitAll(nccl_comms.data(), ndevice, (int const *)device_ids));
for (int i = 0; i < ndevice; i++) {
comms[i] = new InfinicclComm{INFINI_DEVICE_NVIDIA, device_ids[i], (void *)(nccl_comms[i])};
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t commDestroy(infinicclComm_t comm) {
CHECK_NCCL(ncclCommDestroy(getNcclComm(comm)));
delete comm;
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t allReduce(
void *sendbuf,
void *recvbuf,
size_t count,
infiniDtype_t datatype,
infinicclReduceOp_t op,
infinicclComm_t comm,
infinirtStream_t stream) {
if (datatype != INFINI_DTYPE_F32 && datatype != INFINI_DTYPE_F16) {
return INFINI_STATUS_BAD_PARAM;
}
CHECK_NCCL(ncclAllReduce(sendbuf, recvbuf, count, getNcclDtype(datatype),
getNcclRedOp(op), getNcclComm(comm), getCudaStream(stream)));
return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::cuda
#ifndef INFINICCL_CUDA_H_
#define INFINICCL_CUDA_H_
#include "../infiniccl_impl.h"
// Windows does not support CUDA
#if defined(ENABLE_CUDA_API) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
#endif
#endif /* INFINICCL_CUDA_H_ */