Commit c0dad530 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parents
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/*
 * Compute the element counts and in-place offsets for the Scatter test.
 *
 * Everything is derived from the per-rank share count/nranks (integer
 * division, so a remainder that does not divide evenly is dropped):
 *   sendcount          - per-rank share times nranks
 *   recvcount          - per-rank share
 *   paramcount         - per-rank share
 *   sendInplaceOffset  - always 0
 *   recvInplaceOffset  - per-rank share
 */
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perRank = count / nranks;

  *sendcount = perRank * nranks;
  *recvcount = perRank;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = perRank;
  *paramcount = perRank;
}
/*
 * Prepare the buffers of every (GPU, rank) slot handled by this thread for
 * one Scatter run.
 *
 * For each slot the receive buffer is zeroed, the expected buffer is filled
 * with recvcount elements starting at offset rank*recvcount, and - only for
 * the slot whose global rank equals `root` - the source buffer (the receive
 * buffer when in_place is set, the send buffer otherwise) is filled with
 * sendcount elements starting at offset 0.
 *
 * `op` is not used; ncclSum is always passed to InitData.
 * Returns testSuccess unless one of the checked calls bails out early.
 */
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t rootElems = args->sendBytes / wordSize(type);
  size_t shareElems = args->expectedBytes / wordSize(type);

  for (int gpu = 0; gpu < args->nGpus; gpu++) {
    HIPCHECK(hipSetDevice(args->gpus[gpu]));

    for (int local = 0; local < args->nRanks; local++) {
      // Buffer slot index: slots are laid out GPU-major, rank-minor.
      int slot = gpu*args->nRanks + local;
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + gpu*args->nRanks + local);

      HIPCHECK(hipMemset(args->recvbuffs[slot], 0, args->expectedBytes));

      void* src;
      if (in_place) {
        src = args->recvbuffs[slot];
      } else {
        src = args->sendbuffs[slot];
      }

      // Only the root owns data to scatter.
      if (rank == root) {
        TESTCHECK(InitData(src, rootElems, 0, type, ncclSum, rep, 1, 0));
      }
      TESTCHECK(InitData(args->expected[slot], shareElems, rank*shareElems, type, ncclSum, rep, 1, 0));
    }

    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
/*
 * Convert a measured Scatter duration into bandwidth figures.
 *
 * algBw is count*nranks*typesize bytes divided by 1e9 and by `sec`;
 * busBw scales that value by (nranks-1)/nranks.
 */
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const size_t totalBytes = count * nranks * typesize;
  const double gbPerSec = (double)totalBytes / 1.0E9 / sec;
  const double busFraction = ((double)(nranks-1)) / ((double)(nranks));

  *algBw = gbPerSec;
  *busBw = gbPerSec * busFraction;
}
/*
 * Issue one Scatter built from point-to-point operations.
 *
 * Inside a single NCCL group the root posts one send per rank, each taking
 * `count` elements from consecutive chunks of sendbuff, and every rank
 * (root included) posts one receive of `count` elements from the root.
 * When count is 0 nothing is posted, but the communicator is still queried
 * first. `op` is not used.
 */
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int commSize;
  NCCLCHECK(ncclCommCount(comm, &commSize));
  int myRank;
  NCCLCHECK(ncclCommUserRank(comm, &myRank));

  // Byte distance between the chunks destined for consecutive ranks.
  size_t chunkBytes = count * wordSize(type);

  if (count == 0) {
    return testSuccess;
  }

  NCCLCHECK(ncclGroupStart());
  if (myRank == root) {
    char* const base = (char*)sendbuff;
    for (int peer = 0; peer < commSize; peer++) {
      NCCLCHECK(ncclSend(base + peer*chunkBytes, count, type, peer, comm, stream));
    }
  }
  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
  NCCLCHECK(ncclGroupEnd());

  return testSuccess;
}
// Descriptor for the Scatter test: its display name followed by the four
// Scatter callbacks (count computation, data initialisation, bandwidth
// calculation, collective launch). The initializer is positional, so the
// order must match the field order of struct testColl, which is declared
// outside this file.
struct testColl scatterTest = {
"Scatter",
ScatterGetCollByteCount,
ScatterInitData,
ScatterGetBw,
ScatterRunColl
};
/*
 * Report only the send and receive element counts for a Scatter of `count`
 * elements over `nranks` ranks. Delegates to ScatterGetCollByteCount and
 * discards the remaining outputs.
 */
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  // Scratch destinations for the outputs the caller is not interested in.
  size_t ignoredParamCount;
  size_t ignoredSendOffset;
  size_t ignoredRecvOffset;

  ScatterGetCollByteCount(sendcount, recvcount, &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset, count, nranks);
}
/*
 * Drive the Scatter test over the requested data types and roots.
 *
 * A `type` of -1 selects every entry of test_types; otherwise only `type`
 * is run. A `root` of -1 sweeps roots 0 .. nProcs*nThreads*nGpus-1;
 * otherwise only `root` is run. `op` and `opName` are ignored: TimeTest is
 * always called with reduction op 0 and the label "none".
 */
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &scatterTest;

  // Default to the single caller-supplied type; widen to the full table on -1.
  ncclDataType_t *typeList = &type;
  const char **typeNameList = &typeName;
  int numTypes = 1;
  if ((int)type == -1) {
    typeList = test_types;
    typeNameList = test_typenames;
    numTypes = test_typenum;
  }

  // Default to the full root sweep; narrow to one root when it is given.
  // NOTE(review): this bound does not multiply by args->nRanks although
  // ScatterInitData numbers ranks with it - confirm this is intended.
  int firstRoot = 0;
  int lastRoot = args->nProcs*args->nThreads*args->nGpus-1;
  if (root != -1) {
    firstRoot = root;
    lastRoot = root;
  }

  for (int t = 0; t < numTypes; t++) {
    for (int r = firstRoot; r <= lastRoot; r++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], (ncclRedOp_t)0, "none", r));
    }
  }
  return testSuccess;
}
// Engine entry points exported by the Scatter test: the buffer-size query
// and the test driver. The initializer is positional; the field order comes
// from struct testEngine, which is declared outside this file.
struct testEngine ncclTestEngine = {
ScatterGetBuffSize,
ScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/*
 * Compute the element counts and in-place offsets for the SendRecv test.
 *
 * Send, receive and parameter counts all equal `count`; both in-place
 * offsets are 0. `nranks` does not influence the result.
 */
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  (void)nranks;

  *sendcount = count;
  *recvcount = count;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
  *paramcount = count;
}
/*
 * Prepare the buffers of every (GPU, rank) slot handled by this thread for
 * one SendRecv run.
 *
 * For each slot the receive buffer is zeroed, the source buffer (receive
 * buffer when in_place is set, send buffer otherwise) is filled with
 * sendcount elements at offset rank*sendcount, and the expected buffer is
 * filled with the data of the previous rank in the ring, i.e. recvcount
 * elements at offset peer*recvcount with peer = (rank-1) modulo the total
 * rank count.
 *
 * `op` and `root` are not used; ncclSum is always passed to InitData.
 * Error reporting is switched off for in-place runs.
 */
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendElems = args->sendBytes / wordSize(type);
  size_t recvElems = args->expectedBytes / wordSize(type);
  int totalRanks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;

  for (int gpu = 0; gpu < args->nGpus; gpu++) {
    HIPCHECK(hipSetDevice(args->gpus[gpu]));

    for (int local = 0; local < args->nRanks; local++) {
      // Buffer slot index: slots are laid out GPU-major, rank-minor.
      int slot = gpu*args->nRanks + local;
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + gpu*args->nRanks + local);

      HIPCHECK(hipMemset(args->recvbuffs[slot], 0, args->expectedBytes));

      void* src;
      if (in_place) {
        src = args->recvbuffs[slot];
      } else {
        src = args->sendbuffs[slot];
      }
      TESTCHECK(InitData(src, sendElems, rank*sendElems, type, ncclSum, rep, 1, 0));

      // Each rank receives from its predecessor in the ring.
      int prev = (rank-1+totalRanks)%totalRanks;
      TESTCHECK(InitData(args->expected[slot], recvElems, prev*recvElems, type, ncclSum, rep, 1, 0));
    }

    HIPCHECK(hipDeviceSynchronize());
  }

  // We don't support in-place sendrecv
  if (in_place) {
    args->reportErrors = 0;
  } else {
    args->reportErrors = 1;
  }
  return testSuccess;
}
/*
 * Convert a measured SendRecv duration into bandwidth figures.
 *
 * algBw is count*typesize bytes divided by 1e9 and by `sec`. The bus
 * bandwidth factor for this test is 1, so busBw equals algBw. `nranks`
 * does not influence the result.
 */
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double busFactor = 1;
  const size_t totalBytes = count * typesize;
  const double gbPerSec = (double)totalBytes / 1.0E9 / sec;

  *algBw = gbPerSec;
  *busBw = gbPerSec * busFactor;
}
/*
 * Issue one ring-style exchange: inside a single NCCL group every rank
 * sends `count` elements to rank+1 and receives `count` elements from
 * rank-1, both taken modulo the communicator size. `op` and `root` are
 * not used.
 */
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int commSize;
  NCCLCHECK(ncclCommCount(comm, &commSize));
  int myRank;
  NCCLCHECK(ncclCommUserRank(comm, &myRank));

  // Ring neighbours: predecessor supplies data, successor consumes it.
  const int prev = (myRank-1+commSize) % commSize;
  const int next = (myRank+1) % commSize;

  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, next, comm, stream));
  NCCLCHECK(ncclRecv(recvbuff, count, type, prev, comm, stream));
  NCCLCHECK(ncclGroupEnd());

  return testSuccess;
}
// Descriptor for the SendRecv test: its display name followed by the four
// SendRecv callbacks (count computation, data initialisation, bandwidth
// calculation, collective launch). The initializer is positional, so the
// order must match the field order of struct testColl, which is declared
// outside this file.
struct testColl sendRecvTest = {
"SendRecv",
SendRecvGetCollByteCount,
SendRecvInitData,
SendRecvGetBw,
SendRecvRunColl
};
/*
 * Report only the send and receive element counts for a SendRecv of
 * `count` elements. Delegates to SendRecvGetCollByteCount and discards the
 * remaining outputs.
 */
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  // Scratch destinations for the outputs the caller is not interested in.
  size_t ignoredParamCount;
  size_t ignoredSendOffset;
  size_t ignoredRecvOffset;

  SendRecvGetCollByteCount(sendcount, recvcount, &ignoredParamCount, &ignoredSendOffset, &ignoredRecvOffset, count, nranks);
}
/*
 * Drive the SendRecv test over the requested data types and reduction ops.
 *
 * A `type` of -1 selects every entry of test_types; an `op` of -1 selects
 * every entry of test_ops. Otherwise only the supplied value is run.
 * TimeTest is always called with root -1; the `root` argument is ignored.
 */
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &sendRecvTest;

  // Default to the single caller-supplied type; widen to the full table on -1.
  ncclDataType_t *typeList = &type;
  const char **typeNameList = &typeName;
  int numTypes = 1;
  if ((int)type == -1) {
    typeList = test_types;
    typeNameList = test_typenames;
    numTypes = test_typenum;
  }

  // Same selection logic for the reduction operation.
  ncclRedOp_t *opList = &op;
  const char **opNameList = &opName;
  int numOps = 1;
  if ((int)op == -1) {
    opList = test_ops;
    opNameList = test_opnames;
    numOps = test_opnum;
  }

  for (int t = 0; t < numTypes; t++) {
    for (int o = 0; o < numOps; o++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
    }
  }
  return testSuccess;
}
// Engine entry points exported by the SendRecv test: the buffer-size query
// and the test driver. The initializer is positional; the field order comes
// from struct testEngine, which is declared outside this file.
struct testEngine ncclTestEngine = {
SendRecvGetBuffSize,
SendRecvRunTest
};
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
// Current std::chrono::steady_clock reading, expressed as the number of
// nanoseconds since that clock's epoch.
std::uint64_t now() {
  const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
  const auto asNanos = std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch);
  return asNanos.count();
}
}
// Start the stopwatch: remember the clock reading at construction time.
timer::timer() : t0(now()) {
}
double timer::elapsed() const {
std::uint64_t t1 = now();
return 1.e-9*(t1 - t0);
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
// Small stopwatch. The member functions are only declared here so that
// this header does not need <chrono> (see the note above).
class timer {
// Reference clock reading, in nanoseconds, that intervals are measured from.
std::uint64_t t0;
public:
// Starts timing from the moment of construction.
timer();
// Returns the seconds elapsed since the reference reading.
double elapsed() const;
// Returns the seconds elapsed and makes "now" the new reference reading.
double reset();
};
#endif
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 781 on master device 0 [0000:9f:00.0] BW200
master:781:781 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:781:781 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:781:781 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:781:781 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:781:781 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:781:781 [0] NCCL INFO ROCr version 1.1
master:781:781 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:781:814 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:781:814 [0] NCCL INFO Using network IB
master:781:814 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xb95015ec1d0d080e - Init START
master:781:814 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:781:814 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:781:814 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:781:814 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:781:814 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:781:814 [0] NCCL INFO Channel 00/32 : 0
master:781:814 [0] NCCL INFO Channel 01/32 : 0
master:781:814 [0] NCCL INFO Channel 02/32 : 0
master:781:814 [0] NCCL INFO Channel 03/32 : 0
master:781:814 [0] NCCL INFO Channel 04/32 : 0
master:781:814 [0] NCCL INFO Channel 05/32 : 0
master:781:814 [0] NCCL INFO Channel 06/32 : 0
master:781:814 [0] NCCL INFO Channel 07/32 : 0
master:781:814 [0] NCCL INFO Channel 08/32 : 0
master:781:814 [0] NCCL INFO Channel 09/32 : 0
master:781:814 [0] NCCL INFO Channel 10/32 : 0
master:781:814 [0] NCCL INFO Channel 11/32 : 0
master:781:814 [0] NCCL INFO Channel 12/32 : 0
master:781:814 [0] NCCL INFO Channel 13/32 : 0
master:781:814 [0] NCCL INFO Channel 14/32 : 0
master:781:814 [0] NCCL INFO Channel 15/32 : 0
master:781:814 [0] NCCL INFO Channel 16/32 : 0
master:781:814 [0] NCCL INFO Channel 17/32 : 0
master:781:814 [0] NCCL INFO Channel 18/32 : 0
master:781:814 [0] NCCL INFO Channel 19/32 : 0
master:781:814 [0] NCCL INFO Channel 20/32 : 0
master:781:814 [0] NCCL INFO Channel 21/32 : 0
master:781:814 [0] NCCL INFO Channel 22/32 : 0
master:781:814 [0] NCCL INFO Channel 23/32 : 0
master:781:814 [0] NCCL INFO Channel 24/32 : 0
master:781:814 [0] NCCL INFO Channel 25/32 : 0
master:781:814 [0] NCCL INFO Channel 26/32 : 0
master:781:814 [0] NCCL INFO Channel 27/32 : 0
master:781:814 [0] NCCL INFO Channel 28/32 : 0
master:781:814 [0] NCCL INFO Channel 29/32 : 0
master:781:814 [0] NCCL INFO Channel 30/32 : 0
master:781:814 [0] NCCL INFO Channel 31/32 : 0
master:781:814 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xf33c60 nRanks 01 busId 9f000
master:781:814 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:781:814 [0] NCCL INFO P2P Chunksize set to 131072
master:781:814 [0] NCCL INFO Connected all rings comm 0xf33c60 nRanks 01 busId 9f000
master:781:814 [0] NCCL INFO Connected all trees
master:781:814 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:781:814 [0] NCCL INFO Init config for nccl_context_test: 0
master:781:814 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:781:814 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xb95015ec1d0d080e localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 318.2 0.02 0.00 0 0.17 44.67 0.00 0
15236 3809 float sum -1 148.8 0.10 0.00 0 0.17 91.51 0.00 0
30472 7618 float sum -1 3.72 8.19 0.00 0 0.17 183.02 0.00 0
60944 15236 float sum -1 5.67 10.75 0.00 0 0.17 366.03 0.00 0
121888 30472 float sum -1 3.93 31.04 0.00 0 0.17 727.69 0.00 0
243776 60944 float sum -1 498.9 0.49 0.00 0 0.17 1421.43 0.00 0
487552 121888 float sum -1 82.86 5.88 0.00 0 0.17 2937.06 0.00 0
975104 243776 float sum -1 10.57 92.27 0.00 0 0.17 5891.87 0.00 0
1950208 487552 float sum -1 1273.7 1.53 0.00 0 0.17 11677.89 0.00 0
3900416 975104 float sum -1 2760.9 1.41 0.00 0 0.17 23425.92 0.00 0
7800832 1950208 float sum -1 2038.5 3.83 0.00 0 0.17 46851.84 0.00 0
15601664 3900416 float sum -1 4799.2 3.25 0.00 0 0.17 93985.93 0.00 0
31203328 7800832 float sum -1 9049.6 3.45 0.00 0 0.16 190264.20 0.00 0
62406656 15601664 float sum -1 10579 5.90 0.00 0 0.17 372577.05 0.00 0
124813312 31203328 float sum -1 5672.0 22.01 0.00 0 0.17 749629.50 0.00 0
249626624 62406656 float sum -1 7586.1 32.91 0.00 0 0.17 1490308.20 0.00 0
499253248 124813312 float sum -1 11629 42.93 0.00 0 0.17 2980616.41 0.00 0
998506496 249626624 float sum -1 16813 59.39 0.00 0 0.17 5961232.81 0.00 0
master:781:781 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:781:781 [0] NCCL INFO comm 0xf33c60 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 782 on master device 0 [0000:9f:00.0] BW200
master:782:782 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:782:782 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:782:782 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:782:782 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:782:782 [0] NCCL INFO ROCr version 1.1
master:782:782 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:782:816 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:782:816 [0] NCCL INFO Using network IB
master:782:816 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x2a9de5f131333451 - Init START
master:782:816 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:782:816 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:782:816 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:782:816 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:782:816 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:782:816 [0] NCCL INFO Channel 00/32 : 0
master:782:816 [0] NCCL INFO Channel 01/32 : 0
master:782:816 [0] NCCL INFO Channel 02/32 : 0
master:782:816 [0] NCCL INFO Channel 03/32 : 0
master:782:816 [0] NCCL INFO Channel 04/32 : 0
master:782:816 [0] NCCL INFO Channel 05/32 : 0
master:782:816 [0] NCCL INFO Channel 06/32 : 0
master:782:816 [0] NCCL INFO Channel 07/32 : 0
master:782:816 [0] NCCL INFO Channel 08/32 : 0
master:782:816 [0] NCCL INFO Channel 09/32 : 0
master:782:816 [0] NCCL INFO Channel 10/32 : 0
master:782:816 [0] NCCL INFO Channel 11/32 : 0
master:782:816 [0] NCCL INFO Channel 12/32 : 0
master:782:816 [0] NCCL INFO Channel 13/32 : 0
master:782:816 [0] NCCL INFO Channel 14/32 : 0
master:782:816 [0] NCCL INFO Channel 15/32 : 0
master:782:816 [0] NCCL INFO Channel 16/32 : 0
master:782:816 [0] NCCL INFO Channel 17/32 : 0
master:782:816 [0] NCCL INFO Channel 18/32 : 0
master:782:816 [0] NCCL INFO Channel 19/32 : 0
master:782:816 [0] NCCL INFO Channel 20/32 : 0
master:782:816 [0] NCCL INFO Channel 21/32 : 0
master:782:816 [0] NCCL INFO Channel 22/32 : 0
master:782:816 [0] NCCL INFO Channel 23/32 : 0
master:782:816 [0] NCCL INFO Channel 24/32 : 0
master:782:816 [0] NCCL INFO Channel 25/32 : 0
master:782:816 [0] NCCL INFO Channel 26/32 : 0
master:782:816 [0] NCCL INFO Channel 27/32 : 0
master:782:816 [0] NCCL INFO Channel 28/32 : 0
master:782:816 [0] NCCL INFO Channel 29/32 : 0
master:782:816 [0] NCCL INFO Channel 30/32 : 0
master:782:816 [0] NCCL INFO Channel 31/32 : 0
master:782:816 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0x8a8cc0 nRanks 01 busId 9f000
master:782:816 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:782:816 [0] NCCL INFO P2P Chunksize set to 131072
master:782:816 [0] NCCL INFO Connected all rings comm 0x8a8cc0 nRanks 01 busId 9f000
master:782:816 [0] NCCL INFO Connected all trees
master:782:816 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:782:816 [0] NCCL INFO Init config for nccl_context_test: 0
master:782:816 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:782:816 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x2a9de5f131333451 localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 272.8 0.03 0.00 0 0.17 44.67 0.00 0
15236 3809 float sum -1 1336.3 0.01 0.00 0 0.16 93.19 0.00 0
30472 7618 float sum -1 4.38 6.96 0.00 0 0.17 184.12 0.00 0
60944 15236 float sum -1 81.28 0.75 0.00 0 0.16 371.61 0.00 0
121888 30472 float sum -1 6.40 19.06 0.00 0 0.17 738.72 0.00 0
243776 60944 float sum -1 4.50 54.11 0.00 0 0.16 1509.45 0.00 0
487552 121888 float sum -1 765.5 0.64 0.00 0 0.17 2954.86 0.00 0
975104 243776 float sum -1 55.02 17.72 0.00 0 0.16 5963.94 0.00 0
1950208 487552 float sum -1 687.0 2.84 0.00 0 0.16 11891.51 0.00 0
3900416 975104 float sum -1 1924.9 2.03 0.00 0 0.16 24002.56 0.00 0
7800832 1950208 float sum -1 2250.7 3.47 0.00 0 0.16 47566.05 0.00 0
15601664 3900416 float sum -1 1429.8 10.91 0.00 0 0.16 95715.73 0.00 0
31203328 7800832 float sum -1 5152.1 6.06 0.00 0 0.16 190264.20 0.00 0
62406656 15601664 float sum -1 7107.6 8.78 0.00 0 0.16 384040.96 0.00 0
124813312 31203328 float sum -1 5949.0 20.98 0.00 0 0.16 763384.17 0.00 0
249626624 62406656 float sum -1 6837.0 36.51 0.00 0 0.16 1522113.56 0.00 0
499253248 124813312 float sum -1 11251 44.38 0.00 0 0.16 3044227.12 0.00 0
998506496 249626624 float sum -1 17819 56.04 0.00 0 0.16 6069948.30 0.00 0
master:782:782 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:782:782 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 783 on master device 0 [0000:9f:00.0] BW200
master:783:783 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:783:783 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:783:783 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:783:783 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:783:783 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:783:783 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:783:783 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:783:783 [0] NCCL INFO ROCr version 1.1
master:783:783 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:783:824 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:783:824 [0] NCCL INFO Using network IB
master:783:824 [0] NCCL INFO comm 0x23d3cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x724f8b19220de670 - Init START
master:783:824 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:783:824 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:783:824 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:783:824 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:783:824 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:783:824 [0] NCCL INFO Channel 00/32 : 0
master:783:824 [0] NCCL INFO Channel 01/32 : 0
master:783:824 [0] NCCL INFO Channel 02/32 : 0
master:783:824 [0] NCCL INFO Channel 03/32 : 0
master:783:824 [0] NCCL INFO Channel 04/32 : 0
master:783:824 [0] NCCL INFO Channel 05/32 : 0
master:783:824 [0] NCCL INFO Channel 06/32 : 0
master:783:824 [0] NCCL INFO Channel 07/32 : 0
master:783:824 [0] NCCL INFO Channel 08/32 : 0
master:783:824 [0] NCCL INFO Channel 09/32 : 0
master:783:824 [0] NCCL INFO Channel 10/32 : 0
master:783:824 [0] NCCL INFO Channel 11/32 : 0
master:783:824 [0] NCCL INFO Channel 12/32 : 0
master:783:824 [0] NCCL INFO Channel 13/32 : 0
master:783:824 [0] NCCL INFO Channel 14/32 : 0
master:783:824 [0] NCCL INFO Channel 15/32 : 0
master:783:824 [0] NCCL INFO Channel 16/32 : 0
master:783:824 [0] NCCL INFO Channel 17/32 : 0
master:783:824 [0] NCCL INFO Channel 18/32 : 0
master:783:824 [0] NCCL INFO Channel 19/32 : 0
master:783:824 [0] NCCL INFO Channel 20/32 : 0
master:783:824 [0] NCCL INFO Channel 21/32 : 0
master:783:824 [0] NCCL INFO Channel 22/32 : 0
master:783:824 [0] NCCL INFO Channel 23/32 : 0
master:783:824 [0] NCCL INFO Channel 24/32 : 0
master:783:824 [0] NCCL INFO Channel 25/32 : 0
master:783:824 [0] NCCL INFO Channel 26/32 : 0
master:783:824 [0] NCCL INFO Channel 27/32 : 0
master:783:824 [0] NCCL INFO Channel 28/32 : 0
master:783:824 [0] NCCL INFO Channel 29/32 : 0
master:783:824 [0] NCCL INFO Channel 30/32 : 0
master:783:824 [0] NCCL INFO Channel 31/32 : 0
master:783:824 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0x23d3cc0 nRanks 01 busId 9f000
master:783:824 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:783:824 [0] NCCL INFO P2P Chunksize set to 131072
master:783:824 [0] NCCL INFO Connected all rings comm 0x23d3cc0 nRanks 01 busId 9f000
master:783:824 [0] NCCL INFO Connected all trees
master:783:824 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:783:824 [0] NCCL INFO Init config for nccl_context_test: 0
master:783:824 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:783:824 [0] NCCL INFO comm 0x23d3cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x724f8b19220de670 localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 4.12 1.85 0.00 0 0.16 46.87 0.00 0
15236 3809 float sum -1 4.25 3.59 0.00 0 0.16 93.19 0.00 0
30472 7618 float sum -1 14.79 2.06 0.00 0 0.16 188.10 0.00 0
60944 15236 float sum -1 4.17 14.62 0.00 0 0.16 375.04 0.00 0
121888 30472 float sum -1 3.91 31.20 0.00 0 0.16 740.96 0.00 0
243776 60944 float sum -1 4.09 59.68 0.00 0 0.16 1504.79 0.00 0
487552 121888 float sum -1 33.47 14.57 0.00 0 0.16 2981.97 0.00 0
975104 243776 float sum -1 10.07 96.82 0.00 0 0.16 6019.16 0.00 0
1950208 487552 float sum -1 2233.8 0.87 0.00 0 0.16 12038.32 0.00 0
3900416 975104 float sum -1 131.8 29.60 0.00 0 0.16 23855.76 0.00 0
7800832 1950208 float sum -1 3572.0 2.18 0.00 0 0.16 47857.87 0.00 0
15601664 3900416 float sum -1 4244.6 3.68 0.00 0 0.16 96010.24 0.00 0
31203328 7800832 float sum -1 3314.8 9.41 0.00 0 0.16 189685.88 0.00 0
62406656 15601664 float sum -1 3913.5 15.95 0.00 0 0.16 385226.27 0.00 0
124813312 31203328 float sum -1 7858.3 15.88 0.00 0 0.16 765725.84 0.00 0
249626624 62406656 float sum -1 5494.7 45.43 0.00 0 0.16 1522113.56 0.00 0
499253248 124813312 float sum -1 10943 45.62 0.00 0 0.16 3081810.17 0.00 0
998506496 249626624 float sum -1 17957 55.61 0.00 0 0.16 6182702.76 0.00 0
master:783:783 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:783:783 [0] NCCL INFO comm 0x23d3cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 784 on master device 0 [0000:9f:00.0] BW200
master:784:784 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:784:784 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:784:784 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:784:784 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:784:784 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:784:784 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:784:784 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:784:784 [0] NCCL INFO ROCr version 1.1
master:784:784 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:784:811 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:784:811 [0] NCCL INFO Using network IB
master:784:811 [0] NCCL INFO comm 0x2432cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xa13d0bc47641bee9 - Init START
master:784:811 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:784:811 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:784:811 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:784:811 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:784:811 [0] NCCL INFO Setting affinity for GPU 0 to 0fff0000,00000000
master:784:811 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:784:811 [0] NCCL INFO Channel 00/32 : 0
master:784:811 [0] NCCL INFO Channel 01/32 : 0
master:784:811 [0] NCCL INFO Channel 02/32 : 0
master:784:811 [0] NCCL INFO Channel 03/32 : 0
master:784:811 [0] NCCL INFO Channel 04/32 : 0
master:784:811 [0] NCCL INFO Channel 05/32 : 0
master:784:811 [0] NCCL INFO Channel 06/32 : 0
master:784:811 [0] NCCL INFO Channel 07/32 : 0
master:784:811 [0] NCCL INFO Channel 08/32 : 0
master:784:811 [0] NCCL INFO Channel 09/32 : 0
master:784:811 [0] NCCL INFO Channel 10/32 : 0
master:784:811 [0] NCCL INFO Channel 11/32 : 0
master:784:811 [0] NCCL INFO Channel 12/32 : 0
master:784:811 [0] NCCL INFO Channel 13/32 : 0
master:784:811 [0] NCCL INFO Channel 14/32 : 0
master:784:811 [0] NCCL INFO Channel 15/32 : 0
master:784:811 [0] NCCL INFO Channel 16/32 : 0
master:784:811 [0] NCCL INFO Channel 17/32 : 0
master:784:811 [0] NCCL INFO Channel 18/32 : 0
master:784:811 [0] NCCL INFO Channel 19/32 : 0
master:784:811 [0] NCCL INFO Channel 20/32 : 0
master:784:811 [0] NCCL INFO Channel 21/32 : 0
master:784:811 [0] NCCL INFO Channel 22/32 : 0
master:784:811 [0] NCCL INFO Channel 23/32 : 0
master:784:811 [0] NCCL INFO Channel 24/32 : 0
master:784:811 [0] NCCL INFO Channel 25/32 : 0
master:784:811 [0] NCCL INFO Channel 26/32 : 0
master:784:811 [0] NCCL INFO Channel 27/32 : 0
master:784:811 [0] NCCL INFO Channel 28/32 : 0
master:784:811 [0] NCCL INFO Channel 29/32 : 0
master:784:811 [0] NCCL INFO Channel 30/32 : 0
master:784:811 [0] NCCL INFO Channel 31/32 : 0
master:784:811 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0x2432cc0 nRanks 01 busId 9f000
master:784:811 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:784:811 [0] NCCL INFO P2P Chunksize set to 131072
master:784:811 [0] NCCL INFO Connected all rings comm 0x2432cc0 nRanks 01 busId 9f000
master:784:811 [0] NCCL INFO Connected all trees
master:784:811 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:784:811 [0] NCCL INFO Init config for nccl_context_test: 0
master:784:811 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:784:811 [0] NCCL INFO comm 0x2432cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xa13d0bc47641bee9 localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 4.50 1.69 0.00 0 0.17 44.93 0.00 0
15236 3809 float sum -1 218.9 0.07 0.00 0 0.17 90.15 0.00 0
30472 7618 float sum -1 382.4 0.08 0.00 0 0.17 178.20 0.00 0
60944 15236 float sum -1 6.94 8.78 0.00 0 0.18 336.71 0.00 0
121888 30472 float sum -1 1242.0 0.10 0.00 0 0.17 719.10 0.00 0
243776 60944 float sum -1 1373.2 0.18 0.00 0 0.17 1442.46 0.00 0
487552 121888 float sum -1 15.72 31.01 0.00 0 0.17 2867.95 0.00 0
975104 243776 float sum -1 17.34 56.23 0.00 0 0.18 5432.33 0.00 0
1950208 487552 float sum -1 1293.9 1.51 0.00 0 0.17 11471.81 0.00 0
3900416 975104 float sum -1 1232.8 3.16 0.00 0 0.17 23079.38 0.00 0
7800832 1950208 float sum -1 1745.5 4.47 0.00 0 0.17 45752.68 0.00 0
15601664 3900416 float sum -1 5316.8 2.93 0.00 0 0.17 91774.49 0.00 0
31203328 7800832 float sum -1 1549.4 20.14 0.00 0 0.17 184090.43 0.00 0
62406656 15601664 float sum -1 2868.4 21.76 0.00 0 0.17 368180.86 0.00 0
124813312 31203328 float sum -1 6969.6 17.91 0.00 0 0.17 734195.95 0.00 0
249626624 62406656 float sum -1 5689.2 43.88 0.00 0 0.17 1481463.64 0.00 0
499253248 124813312 float sum -1 10995 45.41 0.00 0 0.17 2945446.89 0.00 0
998506496 249626624 float sum -1 17552 56.89 0.00 0 0.17 5890893.78 0.00 0
master:784:784 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:784:784 [0] NCCL INFO comm 0x2432cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 785 on master device 0 [0000:9f:00.0] BW200
master:785:785 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:785:785 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:785:785 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:785:785 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:785:785 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:785:785 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:785:785 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:785:785 [0] NCCL INFO ROCr version 1.1
master:785:785 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:785:826 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:785:826 [0] NCCL INFO Using network IB
master:785:826 [0] NCCL INFO comm 0xaf9cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x35d771c35e045c1b - Init START
master:785:826 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:785:826 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:785:826 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:785:826 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:785:826 [0] NCCL INFO Setting affinity for GPU 0 to f0000000,00000000
master:785:826 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:785:826 [0] NCCL INFO Channel 00/32 : 0
master:785:826 [0] NCCL INFO Channel 01/32 : 0
master:785:826 [0] NCCL INFO Channel 02/32 : 0
master:785:826 [0] NCCL INFO Channel 03/32 : 0
master:785:826 [0] NCCL INFO Channel 04/32 : 0
master:785:826 [0] NCCL INFO Channel 05/32 : 0
master:785:826 [0] NCCL INFO Channel 06/32 : 0
master:785:826 [0] NCCL INFO Channel 07/32 : 0
master:785:826 [0] NCCL INFO Channel 08/32 : 0
master:785:826 [0] NCCL INFO Channel 09/32 : 0
master:785:826 [0] NCCL INFO Channel 10/32 : 0
master:785:826 [0] NCCL INFO Channel 11/32 : 0
master:785:826 [0] NCCL INFO Channel 12/32 : 0
master:785:826 [0] NCCL INFO Channel 13/32 : 0
master:785:826 [0] NCCL INFO Channel 14/32 : 0
master:785:826 [0] NCCL INFO Channel 15/32 : 0
master:785:826 [0] NCCL INFO Channel 16/32 : 0
master:785:826 [0] NCCL INFO Channel 17/32 : 0
master:785:826 [0] NCCL INFO Channel 18/32 : 0
master:785:826 [0] NCCL INFO Channel 19/32 : 0
master:785:826 [0] NCCL INFO Channel 20/32 : 0
master:785:826 [0] NCCL INFO Channel 21/32 : 0
master:785:826 [0] NCCL INFO Channel 22/32 : 0
master:785:826 [0] NCCL INFO Channel 23/32 : 0
master:785:826 [0] NCCL INFO Channel 24/32 : 0
master:785:826 [0] NCCL INFO Channel 25/32 : 0
master:785:826 [0] NCCL INFO Channel 26/32 : 0
master:785:826 [0] NCCL INFO Channel 27/32 : 0
master:785:826 [0] NCCL INFO Channel 28/32 : 0
master:785:826 [0] NCCL INFO Channel 29/32 : 0
master:785:826 [0] NCCL INFO Channel 30/32 : 0
master:785:826 [0] NCCL INFO Channel 31/32 : 0
master:785:826 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xaf9cc0 nRanks 01 busId 9f000
master:785:826 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:785:826 [0] NCCL INFO P2P Chunksize set to 131072
master:785:826 [0] NCCL INFO Connected all rings comm 0xaf9cc0 nRanks 01 busId 9f000
master:785:826 [0] NCCL INFO Connected all trees
master:785:826 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:785:826 [0] NCCL INFO Init config for nccl_context_test: 0
master:785:826 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:785:826 [0] NCCL INFO comm 0xaf9cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x35d771c35e045c1b localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 4.14 1.84 0.00 0 0.18 42.91 0.00 0
15236 3809 float sum -1 1176.7 0.01 0.00 0 0.17 92.34 0.00 0
30472 7618 float sum -1 6.70 4.55 0.00 0 0.16 186.94 0.00 0
60944 15236 float sum -1 7.59 8.03 0.00 0 0.16 373.89 0.00 0
121888 30472 float sum -1 1498.3 0.08 0.00 0 0.17 736.48 0.00 0
243776 60944 float sum -1 9.23 26.40 0.00 0 0.16 1504.79 0.00 0
487552 121888 float sum -1 5.72 85.27 0.00 0 0.17 2945.93 0.00 0
975104 243776 float sum -1 83.54 11.67 0.00 0 0.16 5927.68 0.00 0
1950208 487552 float sum -1 2418.3 0.81 0.00 0 0.17 11748.24 0.00 0
3900416 975104 float sum -1 22.35 174.50 0.00 0 0.17 23638.88 0.00 0
7800832 1950208 float sum -1 2612.1 2.99 0.00 0 0.17 47134.94 0.00 0
15601664 3900416 float sum -1 2013.6 7.75 0.00 0 0.17 93985.93 0.00 0
31203328 7800832 float sum -1 3635.6 8.58 0.00 0 0.17 188539.75 0.00 0
62406656 15601664 float sum -1 2971.7 21.00 0.00 0 0.17 375943.71 0.00 0
124813312 31203328 float sum -1 4725.7 26.41 0.00 0 0.17 754158.98 0.00 0
249626624 62406656 float sum -1 5890.6 42.38 0.00 0 0.17 1512888.63 0.00 0
499253248 124813312 float sum -1 12722 39.24 0.00 0 0.17 3025777.26 0.00 0
998506496 249626624 float sum -1 17775 56.17 0.00 0 0.17 5997036.01 0.00 0
master:785:785 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:785:785 [0] NCCL INFO comm 0xaf9cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 786 on master device 0 [0000:9f:00.0] BW200
master:786:786 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:786:786 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:786:786 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:786:786 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:786:786 [0] NCCL INFO ROCr version 1.1
master:786:786 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:786:828 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:786:828 [0] NCCL INFO Using network IB
master:786:828 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x48079883f045d3de - Init START
master:786:828 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:786:828 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:786:828 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:786:828 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:786:828 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:786:828 [0] NCCL INFO Channel 00/32 : 0
master:786:828 [0] NCCL INFO Channel 01/32 : 0
master:786:828 [0] NCCL INFO Channel 02/32 : 0
master:786:828 [0] NCCL INFO Channel 03/32 : 0
master:786:828 [0] NCCL INFO Channel 04/32 : 0
master:786:828 [0] NCCL INFO Channel 05/32 : 0
master:786:828 [0] NCCL INFO Channel 06/32 : 0
master:786:828 [0] NCCL INFO Channel 07/32 : 0
master:786:828 [0] NCCL INFO Channel 08/32 : 0
master:786:828 [0] NCCL INFO Channel 09/32 : 0
master:786:828 [0] NCCL INFO Channel 10/32 : 0
master:786:828 [0] NCCL INFO Channel 11/32 : 0
master:786:828 [0] NCCL INFO Channel 12/32 : 0
master:786:828 [0] NCCL INFO Channel 13/32 : 0
master:786:828 [0] NCCL INFO Channel 14/32 : 0
master:786:828 [0] NCCL INFO Channel 15/32 : 0
master:786:828 [0] NCCL INFO Channel 16/32 : 0
master:786:828 [0] NCCL INFO Channel 17/32 : 0
master:786:828 [0] NCCL INFO Channel 18/32 : 0
master:786:828 [0] NCCL INFO Channel 19/32 : 0
master:786:828 [0] NCCL INFO Channel 20/32 : 0
master:786:828 [0] NCCL INFO Channel 21/32 : 0
master:786:828 [0] NCCL INFO Channel 22/32 : 0
master:786:828 [0] NCCL INFO Channel 23/32 : 0
master:786:828 [0] NCCL INFO Channel 24/32 : 0
master:786:828 [0] NCCL INFO Channel 25/32 : 0
master:786:828 [0] NCCL INFO Channel 26/32 : 0
master:786:828 [0] NCCL INFO Channel 27/32 : 0
master:786:828 [0] NCCL INFO Channel 28/32 : 0
master:786:828 [0] NCCL INFO Channel 29/32 : 0
master:786:828 [0] NCCL INFO Channel 30/32 : 0
master:786:828 [0] NCCL INFO Channel 31/32 : 0
master:786:828 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xa9fcc0 nRanks 01 busId 9f000
master:786:828 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:786:828 [0] NCCL INFO P2P Chunksize set to 131072
master:786:828 [0] NCCL INFO Connected all rings comm 0xa9fcc0 nRanks 01 busId 9f000
master:786:828 [0] NCCL INFO Connected all trees
master:786:828 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:786:828 [0] NCCL INFO Init config for nccl_context_test: 0
master:786:828 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:786:828 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x48079883f045d3de localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 23.66 0.32 0.00 0 0.16 46.30 0.00 0
15236 3809 float sum -1 82.21 0.19 0.00 0 0.16 92.62 0.00 0
30472 7618 float sum -1 5.07 6.01 0.00 0 0.16 185.24 0.00 0
60944 15236 float sum -1 264.0 0.23 0.00 0 0.16 373.89 0.00 0
121888 30472 float sum -1 610.8 0.20 0.00 0 0.16 743.22 0.00 0
243776 60944 float sum -1 9.24 26.38 0.00 0 0.17 1442.46 0.00 0
487552 121888 float sum -1 17.59 27.71 0.00 0 0.16 2972.88 0.00 0
975104 243776 float sum -1 8.35 116.76 0.00 0 0.16 5963.94 0.00 0
1950208 487552 float sum -1 623.9 3.13 0.00 0 0.17 11819.44 0.00 0
3900416 975104 float sum -1 1861.1 2.10 0.00 0 0.16 23928.93 0.00 0
7800832 1950208 float sum -1 3060.8 2.55 0.00 0 0.16 47566.05 0.00 0
15601664 3900416 float sum -1 1961.2 7.96 0.00 0 0.16 95715.73 0.00 0
31203328 7800832 float sum -1 4824.6 6.47 0.00 0 0.16 190846.04 0.00 0
62406656 15601664 float sum -1 2331.5 26.77 0.00 0 0.16 380528.39 0.00 0
124813312 31203328 float sum -1 2811.0 44.40 0.00 0 0.16 758743.54 0.00 0
249626624 62406656 float sum -1 8424.8 29.63 0.00 0 0.16 1517487.08 0.00 0
499253248 124813312 float sum -1 10212 48.89 0.00 0 0.16 3062903.36 0.00 0
998506496 249626624 float sum -1 17520 56.99 0.00 0 0.16 6107073.37 0.00 0
master:786:786 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:786:786 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 787 on master device 0 [0000:9f:00.0] BW200
master:787:787 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:787:787 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:787:787 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:787:787 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:787:787 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:787:787 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:787:787 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:787:787 [0] NCCL INFO ROCr version 1.1
master:787:787 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:787:820 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:787:820 [0] NCCL INFO Using network IB
master:787:820 [0] NCCL INFO comm 0x2025cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x14b4ae64898456c8 - Init START
master:787:820 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:787:820 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:787:820 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:787:820 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:787:820 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:787:820 [0] NCCL INFO Channel 00/32 : 0
master:787:820 [0] NCCL INFO Channel 01/32 : 0
master:787:820 [0] NCCL INFO Channel 02/32 : 0
master:787:820 [0] NCCL INFO Channel 03/32 : 0
master:787:820 [0] NCCL INFO Channel 04/32 : 0
master:787:820 [0] NCCL INFO Channel 05/32 : 0
master:787:820 [0] NCCL INFO Channel 06/32 : 0
master:787:820 [0] NCCL INFO Channel 07/32 : 0
master:787:820 [0] NCCL INFO Channel 08/32 : 0
master:787:820 [0] NCCL INFO Channel 09/32 : 0
master:787:820 [0] NCCL INFO Channel 10/32 : 0
master:787:820 [0] NCCL INFO Channel 11/32 : 0
master:787:820 [0] NCCL INFO Channel 12/32 : 0
master:787:820 [0] NCCL INFO Channel 13/32 : 0
master:787:820 [0] NCCL INFO Channel 14/32 : 0
master:787:820 [0] NCCL INFO Channel 15/32 : 0
master:787:820 [0] NCCL INFO Channel 16/32 : 0
master:787:820 [0] NCCL INFO Channel 17/32 : 0
master:787:820 [0] NCCL INFO Channel 18/32 : 0
master:787:820 [0] NCCL INFO Channel 19/32 : 0
master:787:820 [0] NCCL INFO Channel 20/32 : 0
master:787:820 [0] NCCL INFO Channel 21/32 : 0
master:787:820 [0] NCCL INFO Channel 22/32 : 0
master:787:820 [0] NCCL INFO Channel 23/32 : 0
master:787:820 [0] NCCL INFO Channel 24/32 : 0
master:787:820 [0] NCCL INFO Channel 25/32 : 0
master:787:820 [0] NCCL INFO Channel 26/32 : 0
master:787:820 [0] NCCL INFO Channel 27/32 : 0
master:787:820 [0] NCCL INFO Channel 28/32 : 0
master:787:820 [0] NCCL INFO Channel 29/32 : 0
master:787:820 [0] NCCL INFO Channel 30/32 : 0
master:787:820 [0] NCCL INFO Channel 31/32 : 0
master:787:820 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0x2025cc0 nRanks 01 busId 9f000
master:787:820 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:787:820 [0] NCCL INFO P2P Chunksize set to 131072
master:787:820 [0] NCCL INFO Connected all rings comm 0x2025cc0 nRanks 01 busId 9f000
master:787:820 [0] NCCL INFO Connected all trees
master:787:820 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:787:820 [0] NCCL INFO Init config for nccl_context_test: 0
master:787:820 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:787:820 [0] NCCL INFO comm 0x2025cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x14b4ae64898456c8 localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 472.1 0.02 0.00 0 0.18 43.15 0.00 0
15236 3809 float sum -1 4.32 3.52 0.00 0 0.17 90.96 0.00 0
30472 7618 float sum -1 4.20 7.25 0.00 0 0.17 184.68 0.00 0
60944 15236 float sum -1 6.68 9.12 0.00 0 0.17 368.24 0.00 0
121888 30472 float sum -1 4.19 29.09 0.00 0 0.16 740.96 0.00 0
243776 60944 float sum -1 4.91 49.61 0.00 0 0.16 1490.98 0.00 0
487552 121888 float sum -1 6.15 79.30 0.00 0 0.16 2963.84 0.00 0
975104 243776 float sum -1 13.11 74.36 0.00 0 0.18 5493.54 0.00 0
1950208 487552 float sum -1 600.5 3.25 0.00 0 0.18 10987.09 0.00 0
3900416 975104 float sum -1 1739.3 2.24 0.00 0 0.16 23783.02 0.00 0
7800832 1950208 float sum -1 3433.1 2.27 0.00 0 0.16 47421.47 0.00 0
15601664 3900416 float sum -1 6355.6 2.45 0.00 0 0.16 94842.94 0.00 0
31203328 7800832 float sum -1 6227.7 5.01 0.00 0 0.18 175299.60 0.00 0
62406656 15601664 float sum -1 5716.3 10.92 0.00 0 0.16 380528.39 0.00 0
124813312 31203328 float sum -1 5857.8 21.31 0.00 0 0.16 765725.84 0.00 0
249626624 62406656 float sum -1 6172.8 40.44 0.00 0 0.17 1508317.97 0.00 0
499253248 124813312 float sum -1 10803 46.22 0.00 0 0.16 3062903.36 0.00 0
998506496 249626624 float sum -1 17859 55.91 0.00 0 0.16 6069948.30 0.00 0
master:787:787 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:787:787 [0] NCCL INFO comm 0x2025cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _ZN12_GLOBAL__N_113prepareInput2IfNS_9ReduceNilEEEvPT_lT0_iiml please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Pid 788 on master device 0 [0000:9f:00.0] BW200
master:788:788 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:788:788 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:788:788 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
master:788:788 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:788:788 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64
master:788:788 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!
master:788:788 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:788:788 [0] NCCL INFO ROCr version 1.1
master:788:788 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:788:808 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:788:808 [0] NCCL INFO Using network IB
master:788:808 [0] NCCL INFO comm 0xf26cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xa615ead25bf102d3 - Init START
master:788:808 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:788:808 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:788:808 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:788:808 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.
master:788:808 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:788:808 [0] NCCL INFO Channel 00/32 : 0
master:788:808 [0] NCCL INFO Channel 01/32 : 0
master:788:808 [0] NCCL INFO Channel 02/32 : 0
master:788:808 [0] NCCL INFO Channel 03/32 : 0
master:788:808 [0] NCCL INFO Channel 04/32 : 0
master:788:808 [0] NCCL INFO Channel 05/32 : 0
master:788:808 [0] NCCL INFO Channel 06/32 : 0
master:788:808 [0] NCCL INFO Channel 07/32 : 0
master:788:808 [0] NCCL INFO Channel 08/32 : 0
master:788:808 [0] NCCL INFO Channel 09/32 : 0
master:788:808 [0] NCCL INFO Channel 10/32 : 0
master:788:808 [0] NCCL INFO Channel 11/32 : 0
master:788:808 [0] NCCL INFO Channel 12/32 : 0
master:788:808 [0] NCCL INFO Channel 13/32 : 0
master:788:808 [0] NCCL INFO Channel 14/32 : 0
master:788:808 [0] NCCL INFO Channel 15/32 : 0
master:788:808 [0] NCCL INFO Channel 16/32 : 0
master:788:808 [0] NCCL INFO Channel 17/32 : 0
master:788:808 [0] NCCL INFO Channel 18/32 : 0
master:788:808 [0] NCCL INFO Channel 19/32 : 0
master:788:808 [0] NCCL INFO Channel 20/32 : 0
master:788:808 [0] NCCL INFO Channel 21/32 : 0
master:788:808 [0] NCCL INFO Channel 22/32 : 0
master:788:808 [0] NCCL INFO Channel 23/32 : 0
master:788:808 [0] NCCL INFO Channel 24/32 : 0
master:788:808 [0] NCCL INFO Channel 25/32 : 0
master:788:808 [0] NCCL INFO Channel 26/32 : 0
master:788:808 [0] NCCL INFO Channel 27/32 : 0
master:788:808 [0] NCCL INFO Channel 28/32 : 0
master:788:808 [0] NCCL INFO Channel 29/32 : 0
master:788:808 [0] NCCL INFO Channel 30/32 : 0
master:788:808 [0] NCCL INFO Channel 31/32 : 0
master:788:808 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xf26cc0 nRanks 01 busId 9f000
master:788:808 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:788:808 [0] NCCL INFO P2P Chunksize set to 131072
master:788:808 [0] NCCL INFO Connected all rings comm 0xf26cc0 nRanks 01 busId 9f000
master:788:808 [0] NCCL INFO Connected all trees
master:788:808 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:788:808 [0] NCCL INFO Init config for nccl_context_test: 0
master:788:808 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2
master:788:808 [0] NCCL INFO comm 0xf26cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0xa615ead25bf102d3 localSize 464 used 67142608 bytes - Init COMPLETE
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
7616 1904 float sum -1 9.67 0.79 0.00 0 0.17 45.47 0.00 0
15236 3809 float sum -1 9.98 1.53 0.00 0 0.17 90.69 0.00 0
30472 7618 float sum -1 13.63 2.24 0.00 0 0.17 179.25 0.00 0
60944 15236 float sum -1 1785.0 0.03 0.00 0 0.17 349.25 0.00 0
121888 30472 float sum -1 80.54 1.51 0.00 0 0.17 719.10 0.00 0
243776 60944 float sum -1 5.25 46.40 0.00 0 0.17 1446.74 0.00 0
487552 121888 float sum -1 1161.6 0.42 0.00 0 0.17 2867.95 0.00 0
975104 243776 float sum -1 7.81 124.90 0.00 0 0.17 5804.19 0.00 0
1950208 487552 float sum -1 16.09 121.21 0.00 0 0.17 11573.93 0.00 0
3900416 975104 float sum -1 2731.3 1.43 0.00 0 0.17 23147.87 0.00 0
7800832 1950208 float sum -1 1478.3 5.28 0.00 0 0.17 46158.77 0.00 0
15601664 3900416 float sum -1 515.1 30.29 0.00 0 0.17 92317.54 0.00 0
31203328 7800832 float sum -1 1436.0 21.73 0.00 0 0.17 184635.08 0.00 0
62406656 15601664 float sum -1 3433.0 18.18 0.00 0 0.17 369270.15 0.00 0
124813312 31203328 float sum -1 5122.9 24.36 0.00 0 0.17 734195.95 0.00 0
249626624 62406656 float sum -1 6770.1 36.87 0.00 0 0.17 1477080.62 0.00 0
499253248 124813312 float sum -1 14773 33.80 0.00 0 0.17 2954161.23 0.00 0
998506496 249626624 float sum -1 18711 53.37 0.00 0 0.17 5890893.78 0.00 0
master:788:788 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:788:788 [0] NCCL INFO comm 0xf26cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0
#
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment