Unverified Commit 3959c943 authored by zhangyue's avatar zhangyue Committed by GitHub
Browse files

Issue/472: 接入昆仑芯通信库 (#479)

* issue/472: p800 ccl

* issue/472: 删掉无用操作

* issue/472: fix format

* issue/472: memcpy h2h case
parent 20a2dbd6
...@@ -114,7 +114,7 @@ void *testAllReduceThread(void *arg) { ...@@ -114,7 +114,7 @@ void *testAllReduceThread(void *arg) {
TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype))); TEST_INFINI_THREAD(infinirtMalloc(&buf, args->count * infiniSizeOf(args->dtype)));
TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D)); TEST_INFINI_THREAD(infinirtMemcpy(buf, args->data, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_H2D));
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H)); TEST_INFINI_THREAD(infinirtMemcpy(output, buf, args->count * infiniSizeOf(args->dtype), INFINIRT_MEMCPY_D2H));
if (checkData(output, args->ans, args->dtype, args->count) != 0) { if (checkData(output, args->ans, args->dtype, args->count) != 0) {
...@@ -126,14 +126,14 @@ void *testAllReduceThread(void *arg) { ...@@ -126,14 +126,14 @@ void *testAllReduceThread(void *arg) {
for (size_t i = 0; i < WARM_UPS; i++) { for (size_t i = 0; i < WARM_UPS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
} }
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
// measure time // measure time
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream)); TEST_INFINI_THREAD(infinicclAllReduce(buf, buf, args->count, args->dtype, INFINICCL_SUM, args->comm, stream));
} }
TEST_INFINI_THREAD(infinirtDeviceSynchronize()); TEST_INFINI_THREAD(infinirtStreamSynchronize(stream));
auto end = std::chrono::high_resolution_clock::now(); auto end = std::chrono::high_resolution_clock::now();
double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count(); double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count();
*args->time = elapsed_ms / ITERATIONS; *args->time = elapsed_ms / ITERATIONS;
...@@ -159,12 +159,12 @@ int testAllReduce(infiniDevice_t device_type, int ndevice) { ...@@ -159,12 +159,12 @@ int testAllReduce(infiniDevice_t device_type, int ndevice) {
for (int i = 0; i < ndevice; i++) { for (int i = 0; i < ndevice; i++) {
device_ids[i] = i; device_ids[i] = i;
} }
TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
for (infiniDtype_t dtype : TEST_DTYPES) { for (infiniDtype_t dtype : TEST_DTYPES) {
setData(dtype, data, MAX_COUNT, 1.0f); setData(dtype, data, MAX_COUNT, 1.0f);
setData(dtype, ans, MAX_COUNT, 1.0f * ndevice); setData(dtype, ans, MAX_COUNT, 1.0f * ndevice);
for (size_t count : TEST_COUNTS) { for (size_t count : TEST_COUNTS) {
TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl; std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl;
for (int rank = 0; rank < ndevice; rank++) { for (int rank = 0; rank < ndevice; rank++) {
thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data, ans, &results[rank], &times[rank]}; thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data, ans, &results[rank], &times[rank]};
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "./ascend/infiniccl_ascend.h" #include "./ascend/infiniccl_ascend.h"
#include "./cambricon/infiniccl_cambricon.h" #include "./cambricon/infiniccl_cambricon.h"
#include "./cuda/infiniccl_cuda.h" #include "./cuda/infiniccl_cuda.h"
#include "./kunlun/infiniccl_kunlun.h"
#include "./metax/infiniccl_metax.h" #include "./metax/infiniccl_metax.h"
#include "./moore/infiniccl_moore.h" #include "./moore/infiniccl_moore.h"
...@@ -23,6 +24,7 @@ __C infiniStatus_t infinicclCommInitAll( ...@@ -23,6 +24,7 @@ __C infiniStatus_t infinicclCommInitAll(
COMM_INIT_ALL(INFINI_DEVICE_CAMBRICON, cambricon); COMM_INIT_ALL(INFINI_DEVICE_CAMBRICON, cambricon);
COMM_INIT_ALL(INFINI_DEVICE_METAX, metax); COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore); COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
...@@ -46,7 +48,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) { ...@@ -46,7 +48,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
COMM_DESTROY(INFINI_DEVICE_CAMBRICON, cambricon); COMM_DESTROY(INFINI_DEVICE_CAMBRICON, cambricon);
COMM_DESTROY(INFINI_DEVICE_METAX, metax); COMM_DESTROY(INFINI_DEVICE_METAX, metax);
COMM_DESTROY(INFINI_DEVICE_MOORE, moore); COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
} }
...@@ -77,6 +79,7 @@ __C infiniStatus_t infinicclAllReduce( ...@@ -77,6 +79,7 @@ __C infiniStatus_t infinicclAllReduce(
ALL_REDUCE(INFINI_DEVICE_CAMBRICON, cambricon); ALL_REDUCE(INFINI_DEVICE_CAMBRICON, cambricon);
ALL_REDUCE(INFINI_DEVICE_METAX, metax); ALL_REDUCE(INFINI_DEVICE_METAX, metax);
ALL_REDUCE(INFINI_DEVICE_MOORE, moore); ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#include "infiniccl_kunlun.h"
#include "../../utils.h"
#include <bkcl.h>
#include <iostream>
#include <vector>
#define CHECK_BKCL(API__) CHECK_INTERNAL(API__, BKCL_SUCCESS)
typedef XPUStream kunlunStream_t;
typedef BKCLContext_t bkclComm_t;
inline kunlunStream_t getKunlunStream(infinirtStream_t stream) {
if (stream == nullptr) {
return 0;
}
return reinterpret_cast<kunlunStream_t>(stream);
}
inline bkclComm_t getBkclComm(infinicclComm_t comm) {
return reinterpret_cast<bkclComm_t>(comm->comm);
}
inline BKCLDataType getBkclDtype(infiniDtype_t datatype) {
switch (datatype) {
case INFINI_DTYPE_F32:
return BKCL_FLOAT;
case INFINI_DTYPE_F16:
return BKCL_FLOAT16;
case INFINI_DTYPE_BF16:
return BKCL_BFLOAT16;
default:
std::cerr << "Unsupported data type: " << datatype << std::endl;
std::abort();
return BKCL_FLOAT16;
}
}
inline BKCLOp getBkclRedOp(infinicclReduceOp_t op) {
switch (op) {
case INFINICCL_SUM:
return BKCL_ADD;
case INFINICCL_PROD:
return BKCL_PRODUCT;
case INFINICCL_MAX:
return BKCL_MAX;
case INFINICCL_MIN:
return BKCL_MIN;
default:
std::abort();
return BKCL_ADD;
}
}
namespace infiniccl::kunlun {
infiniStatus_t commInitAll(
infinicclComm_t *comms,
int ndevice,
const int *device_ids) {
std::vector<bkclComm_t> bkcl_comms(ndevice);
CHECK_BKCL(bkcl_comm_init_all(bkcl_comms.data(), ndevice, device_ids));
for (int i = 0; i < ndevice; i++) {
comms[i] = new InfinicclComm{INFINI_DEVICE_KUNLUN, device_ids[i], (void *)(bkcl_comms[i])};
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t commDestroy(infinicclComm_t comm) {
CHECK_BKCL(bkcl_destroy_context(getBkclComm(comm)));
delete comm;
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t allReduce(
void *sendbuf,
void *recvbuf,
size_t count,
infiniDtype_t datatype,
infinicclReduceOp_t op,
infinicclComm_t comm,
infinirtStream_t stream) {
CHECK_DTYPE(datatype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
CHECK_BKCL(bkcl_all_reduce(
getBkclComm(comm),
sendbuf,
recvbuf,
count,
getBkclDtype(datatype),
getBkclRedOp(op),
getKunlunStream(stream)));
return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::kunlun
#ifndef INFINICCL_KUNLUN_H_
#define INFINICCL_KUNLUN_H_
#include "../infiniccl_impl.h"
#if defined(ENABLE_KUNLUN_API) && defined(ENABLE_CCL)
INFINICCL_DEVICE_API_IMPL(kunlun)
#else
INFINICCL_DEVICE_API_NOOP(kunlun)
#endif
#endif /* INFINICCL_KUNLUN_H_ */
#include "infinirt_kunlun.h" #include "infinirt_kunlun.h"
#include "../../utils.h" #include "../../utils.h"
#include <cstring>
#include <xpu/runtime.h> #include <xpu/runtime.h>
#include <xpu/runtime_ex.h> #include <xpu/runtime_ex.h>
...@@ -20,6 +21,8 @@ infiniStatus_t setDevice(int device_id) { ...@@ -20,6 +21,8 @@ infiniStatus_t setDevice(int device_id) {
} }
infiniStatus_t deviceSynchronize() { infiniStatus_t deviceSynchronize() {
// TODO: kunlun xpu has no device synchronization API
// xpu_wait() is waiting for default stream
CHECK_KUNLUNRT(xpu_wait()); CHECK_KUNLUNRT(xpu_wait());
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -103,17 +106,36 @@ infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKin ...@@ -103,17 +106,36 @@ infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKin
case INFINIRT_MEMCPY_D2D: case INFINIRT_MEMCPY_D2D:
CHECK_KUNLUNRT(xpu_memcpy(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_DEVICE_TO_DEVICE)); CHECK_KUNLUNRT(xpu_memcpy(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_DEVICE_TO_DEVICE));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
case INFINIRT_MEMCPY_H2H:
std::memcpy(dst, src, size);
return INFINI_STATUS_SUCCESS;
default: default:
return INFINI_STATUS_INTERNAL_ERROR; return INFINI_STATUS_INTERNAL_ERROR;
} }
} }
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) { infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
// no async memcpy func in kunlun2 switch (kind) {
return memcpy(dst, src, size, kind); case INFINIRT_MEMCPY_H2D:
CHECK_KUNLUNRT(xpu_memcpy_async(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_HOST_TO_DEVICE, (kunlunStream_t)stream));
return INFINI_STATUS_SUCCESS;
case INFINIRT_MEMCPY_D2H:
CHECK_KUNLUNRT(xpu_memcpy_async(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_DEVICE_TO_HOST, (kunlunStream_t)stream));
return INFINI_STATUS_SUCCESS;
case INFINIRT_MEMCPY_D2D:
CHECK_KUNLUNRT(xpu_memcpy_async(dst, src, static_cast<uint64_t>(size), XPUMemcpyKind::XPU_DEVICE_TO_DEVICE, (kunlunStream_t)stream));
return INFINI_STATUS_SUCCESS;
case INFINIRT_MEMCPY_H2H:
std::memcpy(dst, src, size);
return INFINI_STATUS_SUCCESS;
default:
return INFINI_STATUS_INTERNAL_ERROR;
}
} }
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) { infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
// kunlun3 does not support async memory allocation
// TODO: support async malloc
CHECK_KUNLUNRT(xpu_malloc(p_ptr, static_cast<uint64_t>(size))); CHECK_KUNLUNRT(xpu_malloc(p_ptr, static_cast<uint64_t>(size)));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
...@@ -303,6 +303,9 @@ target("infiniccl") ...@@ -303,6 +303,9 @@ target("infiniccl")
if has_config("moore-gpu") then if has_config("moore-gpu") then
add_deps("infiniccl-moore") add_deps("infiniccl-moore")
end end
if has_config("kunlun-xpu") then
add_deps("infiniccl-kunlun")
end
set_languages("cxx17") set_languages("cxx17")
......
...@@ -4,6 +4,7 @@ local XRE_DIR = path.join(KUNLUN_HOME, "xre") ...@@ -4,6 +4,7 @@ local XRE_DIR = path.join(KUNLUN_HOME, "xre")
local XTDK_DIR = path.join(KUNLUN_HOME, "xtdk") local XTDK_DIR = path.join(KUNLUN_HOME, "xtdk")
local XDNN_DIR = path.join(KUNLUN_HOME, "xhpc", "xdnn") local XDNN_DIR = path.join(KUNLUN_HOME, "xhpc", "xdnn")
local XBLAS_DIR = path.join(KUNLUN_HOME, "xhpc", "xblas") local XBLAS_DIR = path.join(KUNLUN_HOME, "xhpc", "xblas")
local XCCL_DIR = path.join(KUNLUN_HOME, "xccl")
-- Add include dirs -- Add include dirs
add_includedirs(path.join(XRE_DIR, "include"), {public = true}) add_includedirs(path.join(XRE_DIR, "include"), {public = true})
...@@ -15,6 +16,8 @@ add_includedirs(path.join(XBLAS_DIR, "include"), {public = true}) ...@@ -15,6 +16,8 @@ add_includedirs(path.join(XBLAS_DIR, "include"), {public = true})
add_linkdirs(path.join(XRE_DIR, "so")) add_linkdirs(path.join(XRE_DIR, "so"))
add_linkdirs(path.join(XDNN_DIR, "so")) add_linkdirs(path.join(XDNN_DIR, "so"))
add_linkdirs(path.join(XBLAS_DIR, "so")) add_linkdirs(path.join(XBLAS_DIR, "so"))
-- Add links
add_links("xpurt", "xpuapi", "xpu_blas") add_links("xpurt", "xpuapi", "xpu_blas")
rule("xpu") rule("xpu")
...@@ -94,5 +97,20 @@ target("infinirt-kunlun") ...@@ -94,5 +97,20 @@ target("infinirt-kunlun")
-- Add include dirs -- Add include dirs
add_files("$(projectdir)/src/infinirt/kunlun/*.cc") add_files("$(projectdir)/src/infinirt/kunlun/*.cc")
add_cxflags("-lstdc++ -Wall -Werror -fPIC") add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
target("infiniccl-kunlun")
set_kind("static")
add_deps("infinirt")
add_deps("infini-utils")
set_warnings("all", "error")
set_languages("cxx17")
on_install(function (target) end)
if has_config("ccl") then
add_includedirs(path.join(XCCL_DIR, "include"))
add_linkdirs(path.join(XCCL_DIR, "so"))
add_links("bkcl")
add_files("$(projectdir)/src/infiniccl/kunlun/*.cc")
add_cxflags("-lstdc++ -fPIC")
end
target_end() target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment