/************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #include "common.h" #define USE_RCCL_GATHER_SCATTER void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { if (count < nranks*nranks/2) { *sendcount = 0; *recvcount = 0; *sendInplaceOffset = 0; *recvInplaceOffset = 0; *paramcount = 0; } else { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; *sendInplaceOffset = 0; *recvInplaceOffset = 0; *paramcount = count/nranks; } } testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); #if 0 int *dataHost = (int *)malloc(args->sendBytes); hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); printf(" Rank [%d] Original: ", rank); for(int j=0; jexpected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); rdisp += rcount; } k++; } HIPCHECK(hipDeviceSynchronize()); } // We don't support in-place alltoall args->reportErrors = in_place ? 0 : 1; return testSuccess; } void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; *algBw = baseBw; double factor = ((double)(nranks-1))/((double)(nranks)); *busBw = baseBw * factor; } testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { int nranks; NCCLCHECK(ncclCommCount(comm, &nranks)); int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); if (count == 0) return testSuccess; size_t *sendcounts, *recvcounts, *sdispls, *rdispls; sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t)); recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t)); sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t)); rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t)); if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) { printf("failed to allocate buffers for alltoallv\n"); return testNcclError; } size_t disp = 0; size_t chunksize = count*2/nranks; for (int i = 0; i < nranks; i++) { size_t scount = ((i+rank)%nranks)*chunksize; if ((i+rank)%nranks == 0) scount += (count*nranks-chunksize*(nranks-1)*nranks/2); sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount; sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp; disp += scount; //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type)); } #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; #else #if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER) NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream)); #else NCCLCHECK(ncclGroupStart()); for (int r=0; rcollTest = &alltoAllTest; ncclDataType_t *run_types; const char **run_typenames; int type_count; if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; } else { type_count = ncclNumTypes; run_types = test_types; run_typenames = test_typenames; } for (int i=0; i