/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef __COMMON_H__ #define __COMMON_H__ #include "rccl/rccl.h" #include #include #include #ifdef MPI_SUPPORT #include "mpi.h" #endif #include #include "nccl1_compat.h" #include "timer.h" // For nccl.h < 2.13 since we define a weak fallback extern "C" char const* ncclGetLastError(ncclComm_t comm); #define HIPCHECK(cmd) do { \ hipError_t e = cmd; \ if( e != hipSuccess ) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf("%s: Test HIP failure %s:%d '%s'\n", \ hostname, \ __FILE__,__LINE__,hipGetErrorString(e)); \ return testCudaError; \ } \ } while(0) #if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0) #define NCCLCHECK(cmd) do { \ ncclResult_t res = cmd; \ if (res != ncclSuccess) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf("%s: Test NCCL failure %s:%d " \ "'%s / %s'\n", \ hostname,__FILE__,__LINE__, \ ncclGetErrorString(res), \ ncclGetLastError(NULL)); \ return testNcclError; \ } \ } while(0) #else #define NCCLCHECK(cmd) do { \ ncclResult_t res = cmd; \ if (res != ncclSuccess) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf("%s: Test NCCL failure %s:%d '%s'\n", \ hostname, \ __FILE__,__LINE__,ncclGetErrorString(res)); \ return testNcclError; \ } \ } while(0) #endif typedef enum { testSuccess = 0, testInternalError = 1, testCudaError = 2, testNcclError = 3, testTimeout = 4, testNumResults = 5 } testResult_t; // Relay errors up and trace #define TESTCHECK(cmd) do { \ testResult_t r = cmd; \ if (r!= testSuccess) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf(" .. %s pid %d: Test failure %s:%d\n", \ hostname, getpid(), \ __FILE__,__LINE__); \ return r; \ } \ } while(0) struct testColl { const char name[20]; void (*getCollByteCount)( size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks); testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; extern struct testColl reduceScatterTest; extern struct testColl broadcastTest; extern struct testColl reduceTest; extern struct testColl alltoAllTest; struct testEngine { void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName); }; extern struct testEngine ncclTestEngine; struct threadArgs { size_t nbytes; size_t minbytes; size_t maxbytes; size_t stepbytes; size_t stepfactor; int totalProcs; int nProcs; int proc; int nThreads; int thread; int nGpus; int* gpus; int localRank; int localNumDevices; int enable_multiranks; int enable_out_of_place; int nRanks; void** sendbuffs; size_t sendBytes; size_t sendInplaceOffset; void** recvbuffs; size_t recvInplaceOffset; ncclUniqueId ncclId; ncclComm_t* comms; hipStream_t* streams; void** expected; size_t expectedBytes; int* errors; double* bw; int* bw_count; int reportErrors; struct testColl* collTest; }; typedef testResult_t (*threadFunc_t)(struct threadArgs* args); struct testThread { pthread_t thread; threadFunc_t func; struct threadArgs args; testResult_t ret; }; // Provided by common.cu extern void Barrier(struct threadArgs* args); extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks); extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank); extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); #include static void getHostName(char* hostname, int maxlen) { gethostname(hostname, maxlen); for (int i=0; i< maxlen; i++) { if (hostname[i] == '.') { hostname[i] = '\0'; return; } } } #include static uint64_t getHash(const char* string, size_t n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (size_t c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } /* Generate a hash of the unique identifying string for this host * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * * $(hostname)$(cat /proc/sys/kernel/random/boot_id) * */ #define HOSTID_FILE "/proc/sys/kernel/random/boot_id" static uint64_t getHostHash(const char* hostname) { char hostHash[1024]; // Fall back is the hostname if something fails (void) strncpy(hostHash, hostname, sizeof(hostHash)); int offset = strlen(hostHash); FILE *file = fopen(HOSTID_FILE, "r"); if (file != NULL) { char *p; if (fscanf(file, "%ms", &p) == 1) { strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); free(p); } } fclose(file); // Make sure the string is terminated hostHash[sizeof(hostHash)-1]='\0'; return getHash(hostHash, strlen(hostHash)); } static size_t wordSize(ncclDataType_t type) { switch(type) { case ncclChar: #if NCCL_MAJOR >= 2 //case ncclInt8: case ncclUint8: #endif return 1; case ncclHalf: #if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 case ncclBfloat16: #endif //case ncclFloat16: return 2; case ncclInt: case ncclFloat: #if NCCL_MAJOR >= 2 //case ncclInt32: case ncclUint32: //case ncclFloat32: #endif return 4; case ncclInt64: case ncclUint64: case ncclDouble: //case ncclFloat64: return 8; default: return 0; } } extern int test_ncclVersion; // init'd with ncclGetVersion() typedef enum { ncclCoarse = 0, ncclFine = 1, ncclHost = 2, ncclManaged = 3, nccl_NUM_MTYPES = 4 } ncclMemoryType_t; extern const char *test_memorytypes[nccl_NUM_MTYPES]; constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); extern int test_opnum; extern int test_typenum; extern ncclDataType_t test_types[ncclNumTypes]; extern const char *test_typenames[ncclNumTypes]; extern ncclRedOp_t test_ops[]; extern const char *test_opnames[]; static int ncclstringtotype(char *str) { for (int t=0; t