/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "hip/hip_runtime.h" #include "common.h" #define ALIGN 4 void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { size_t base = (count/(ALIGN*nranks))*ALIGN; *sendcount = base; *recvcount = base*nranks; *sendInplaceOffset = base; *recvInplaceOffset = 0; *paramcount = base; } testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } k++; } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; *algBw = baseBw; double factor = 1; *busBw = baseBw * factor; } testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); size_t rankSize = count * wordSize(type); if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream)); // Hypercube AllGather for (int mask=1; maskcollTest = &hyperCubeTest; ncclDataType_t *run_types; const char **run_typenames; int type_count; if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; } else { type_count = test_typenum; run_types = test_types; run_typenames = test_typenames; } // Check if this is a power of 2 int nRanks = args->nProcs*args->nThreads*args->nGpus; if (nRanks && !(nRanks & (nRanks - 1))) { for (int i=0; i