/************************************************************************* * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #include #include "StandaloneUtils.hpp" #include #include #include #include #include enum ncclAlg{ AllReduce, ReduceScatter, Broadcast, AllGather, Gather, Reduce, Scatter, SendRecv, AlltoAll, AlltoAllv, ncclAlgCount }; enum cpuType{ cpuIntel, cpuHygon, totalCpuTypes }; enum dcuType{ dcu0 = 0, Vega20 = 0, dcu1 = 1, Device66a1 = 1, dcu2 = 2, Z100SM = 2, dcu3 = 3, DCUK100_AI = 3, totalDcuTypes }; const static float baseBW[totalDcuTypes*totalCpuTypes][ncclAlgCount] = { {9.10, 18.17, 13.50, 18.18, 17.53, 13.59, 18.03, 13.15, 13.21, 10.88}, {9.10, 18.17, 13.50, 18.18, 17.53, 13.59, 18.03, 13.15, 13.21, 10.88}, {9.08, 18.11, 13.56, 18.07, 17.53, 13.56, 18.17, 13.09, 15.61, 12.29}, {9.08, 18.11, 13.56, 18.07, 17.53, 13.56, 18.17, 13.09, 15.61, 12.29}, {9.09, 18.13, 13.57, 18.15, 17.71, 13.59, 18.47, 13.15, 15.61, 12.41}, {9.09, 18.13, 13.57, 18.15, 17.71, 13.59, 18.47, 13.15, 15.61, 12.41}, {18.33, 35.53, 27.70, 36.16, 33.59, 27.72, 36.45, 25.07, 30.39, 24.27}, {13.07, 25.87, 19.59, 25.94, 24.86, 19.50, 25.58, 18.89, 22.57, 17.51} }; const static float deteriorationIndicators = 0.95; double get_time() { struct timeval tp; struct timezone tzp; int i = gettimeofday(&tp, &tzp); return ((double)tp.tv_sec *1000000 + (double)tp.tv_usec); } std::string getCPUName() { std::ifstream cpuinfo("/proc/cpuinfo"); std::string line, name; while (std::getline(cpuinfo, line)) { if (line.find("model name") != std::string::npos || line.find("Processor") != std::string::npos) { size_t start = line.find(":"); if (start != std::string::npos) { name = line.substr(start + 1); name.erase(name.begin(), name.begin() + name.find_first_not_of(" \t\r\n")); std::transform(name.begin(), name.end(), name.begin(), ::tolower); break; } } } return name; } std::string getDCUName() { hipDeviceProp_t props; if (hipSuccess != hipGetDeviceProperties(&props, 0)){ printf("[ ERROR ] Can not get device name!\n"); return "NULL"; } std::string name(props.name); std::transform(name.begin(), name.end(), name.begin(), ::tolower); return name; } float getBaseBW(ncclAlg algType){ std::string cpuName = getCPUName(); int cpuType; if (cpuName.find("intel") != std::string::npos){ cpuType = cpuIntel; }else if (cpuName.find("hygon") != std::string::npos){ cpuType = cpuHygon; }else { printf("[ ERROR ] Get unknown cpu name: %s\n", cpuName.c_str()); return -1.0; } std::string dcuName = getDCUName(); if (std::regex_match(dcuName, std::regex(".*vega.*20.*"))){ return baseBW[dcu0 + cpuType][algType]; }else if (std::regex_match(dcuName, std::regex(".*66a1.*"))){ return baseBW[dcu1 * totalCpuTypes + cpuType][algType]; }else if (std::regex_match(dcuName, std::regex(".*z.*sm.*"))){ return baseBW[dcu2 * totalCpuTypes + cpuType][algType]; }else if (std::regex_match(dcuName, std::regex(".*k.*ai.*"))){ return baseBW[dcu3 * totalCpuTypes + cpuType][algType]; }else{ printf("[ ERROR ] Get unknown device name: %s\n", dcuName.c_str()); return -1.0; } } hipError_t allocResource(int numIntraRank, std::vector &comm, float * &iputCpu, float * &oputCpu, float ** iputGpu, float **oputGpu, hipStream_t* stream, int byteSize){ NCCLCHECK(ncclCommInitAll(comm.data(), numIntraRank, nullptr)); for (int i = 0; i < numIntraRank; i++) { HIPCALL(hipSetDevice(i)); HIPCALL(hipStreamCreate(&stream[i])); } // Allocate GPU memory for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); HIPCALL(hipMalloc((void **)&iputGpu[r], byteSize)); HIPCALL(hipMalloc((void **)&oputGpu[r], byteSize)); } // Allocate CPU memory for input/output iputCpu = (float *)malloc(byteSize); oputCpu = (float *)malloc(byteSize); // Copy the input from CPU memory to GPU memory for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); HIPCALL(hipMemcpy(iputGpu[r], iputCpu, byteSize, hipMemcpyHostToDevice)); } return hipSuccess; } hipError_t releaseResource(int numIntraRank, std::vector& comm, float *iputCpu, float *oputCpu, float **iputGpu, float **oputGpu, hipStream_t* stream){ for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipFree(oputGpu[r])); HIPCALL(hipFree(iputGpu[r])); } free(iputCpu); free(oputCpu); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipStreamDestroy(stream[r])); NCCLCHECK(ncclCommDestroy(comm[r])); } return hipSuccess; } namespace RcclUnitTesting { TEST(Performance, AllReduce) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclAllReduce(iputGpu[r], oputGpu[r], byteSize / sizeof(float), ncclFloat, ncclSum, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(AllReduce); printf("[ INFO ] AllReduce real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, ReduceScatter) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclReduceScatter(iputGpu[r], oputGpu[r], byteSize / sizeof(float) / numIntraRank, ncclFloat, ncclSum, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(ReduceScatter); printf("[ INFO ] ReduceScatter real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, Broadcast) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclBcast(iputGpu[r], byteSize / sizeof(float), ncclFloat, 0, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(Broadcast); printf("[ INFO ] Broadcast real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, AllGather) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclAllGather(iputGpu[r], oputGpu[r], byteSize / sizeof(float) / numIntraRank, ncclFloat, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(AllGather); printf("[ INFO ] AllGather real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, Gather) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclGather(iputGpu[r], oputGpu[r], byteSize / sizeof(float) / numIntraRank, ncclFloat, 0, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(Gather); printf("[ INFO ] Gather real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, Reduce) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclReduce(iputGpu[r], oputGpu[r], byteSize / sizeof(float), ncclFloat, ncclSum, 0, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(Reduce); printf("[ INFO ] Reduce real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, Scatter) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); size_t Offset = byteSize / sizeof(float) / numIntraRank; auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclScatter(iputGpu[r], iputGpu[r]+ Offset * r, byteSize / sizeof(float) / numIntraRank, ncclFloat, 0, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(Scatter); printf("[ INFO ] Scatter real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, SendRecv) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); int nRanks; NCCLCHECK(ncclCommCount(comm[r], &nRanks)); int rank; NCCLCHECK(ncclCommUserRank(comm[r], &rank)); int recvPeer = (rank-1+nRanks) % nRanks; int sendPeer = (rank+1) % nRanks; NCCLCHECK(ncclGroupStart()); NCCLCHECK(ncclSend(iputGpu[r], byteSize / sizeof(float), ncclFloat, sendPeer, comm[r], stream[r])); NCCLCHECK(ncclRecv(oputGpu[r], byteSize / sizeof(float), ncclFloat, recvPeer, comm[r], stream[r])); NCCLCHECK(ncclGroupEnd()); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(SendRecv); printf("[ INFO ] SendRecv real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, AlltoAll) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclAllToAll(iputGpu[r], oputGpu[r], byteSize / sizeof(float) / numIntraRank, ncclFloat, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(AlltoAll); printf("[ INFO ] AlltoAll real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } TEST(Performance, AlltoAllv) { int iterNum = 20; int byteSize = 1024*1024*1024; // Set environment variables to achieve optimal performance if (setenv("NCCL_NCHANNELS_PER_PEER", "4", 1) != 0 || setenv("NCCL_MIN_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MAX_NCHANNELS", "16", 1) != 0 || setenv("NCCL_MIN_P2P_NCHANNELS", "4", 1) != 0 || setenv("NCCL_MAX_P2P_NCHANNELS", "4", 1) != 0) { GTEST_SKIP() << "Failed to set environment variable!"; } // Check for multi-gpu int numIntraRank; HIPCALL(hipGetDeviceCount(&numIntraRank)); if (numIntraRank < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } std::vector comm(numIntraRank); hipStream_t stream[numIntraRank]; float *iputGpu[numIntraRank], *oputGpu[numIntraRank]; float *iputCpu, *oputCpu; // Allocate all resources allocResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream, byteSize); auto commPrimTest = [&](){ for (int iteration = 0; iteration < iterNum; iteration++) { NCCLCHECK(ncclGroupStart()); for (int r = 0; r < numIntraRank; r++) { int nranks; NCCLCHECK(ncclCommCount(comm[r], &nranks)); int rank; NCCLCHECK(ncclCommUserRank(comm[r], &rank)); #define MAX_ALLTOALLV_RANKS 256 static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS]; size_t disp = 0; size_t chunksize = byteSize / sizeof(float) / numIntraRank*2/nranks; for (int i = 0; i < nranks; i++) { size_t scount = ((i+rank)%nranks)*chunksize; if ((i+rank)%nranks == 0) scount += (byteSize / sizeof(float) / numIntraRank*nranks-chunksize*(nranks-1)*nranks/2); sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount; sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp; disp += scount; } HIPCALL(hipSetDevice(r)); NCCLCHECK(ncclAllToAllv(iputGpu[r], sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, oputGpu[r], recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, ncclFloat, comm[r], stream[r])); } NCCLCHECK(ncclGroupEnd()); } for (int r = 0; r < numIntraRank; r++) HIPCALL(hipStreamSynchronize(stream[r])); }; // warmup commPrimTest(); double start = get_time(); commPrimTest(); double costtime = get_time()-start; // Calculate bandwidth double algBW = byteSize / (costtime/iterNum)/1e3; double baseBW = getBaseBW(AlltoAllv); printf("[ INFO ] AlltoAllv real BW of %d cards: %.2f, base BW of 4 cards: %.2f\n", numIntraRank, algBW, baseBW); EXPECT_GT(baseBW, 0); EXPECT_GT(algBW, baseBW * deteriorationIndicators); // Release all resources releaseResource(numIntraRank, comm, iputCpu, oputCpu, iputGpu, oputGpu, stream); } }