stdout 8.04 KB
Newer Older
wangkaixiong's avatar
init  
wangkaixiong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#   Rank  0 Pid    786 on     master device  0 [0000:9f:00.0] BW200
master:786:786 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:786:786 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:786:786 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation

master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:786:786 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64

master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!

master:786:786 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:786:786 [0] NCCL INFO ROCr version 1.1
master:786:786 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:786:828 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:786:828 [0] NCCL INFO Using network IB
master:786:828 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x48079883f045d3de - Init START
master:786:828 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:786:828 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:786:828 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:786:828 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.

master:786:828 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:786:828 [0] NCCL INFO Channel 00/32 :    0
master:786:828 [0] NCCL INFO Channel 01/32 :    0
master:786:828 [0] NCCL INFO Channel 02/32 :    0
master:786:828 [0] NCCL INFO Channel 03/32 :    0
master:786:828 [0] NCCL INFO Channel 04/32 :    0
master:786:828 [0] NCCL INFO Channel 05/32 :    0
master:786:828 [0] NCCL INFO Channel 06/32 :    0
master:786:828 [0] NCCL INFO Channel 07/32 :    0
master:786:828 [0] NCCL INFO Channel 08/32 :    0
master:786:828 [0] NCCL INFO Channel 09/32 :    0
master:786:828 [0] NCCL INFO Channel 10/32 :    0
master:786:828 [0] NCCL INFO Channel 11/32 :    0
master:786:828 [0] NCCL INFO Channel 12/32 :    0
master:786:828 [0] NCCL INFO Channel 13/32 :    0
master:786:828 [0] NCCL INFO Channel 14/32 :    0
master:786:828 [0] NCCL INFO Channel 15/32 :    0
master:786:828 [0] NCCL INFO Channel 16/32 :    0
master:786:828 [0] NCCL INFO Channel 17/32 :    0
master:786:828 [0] NCCL INFO Channel 18/32 :    0
master:786:828 [0] NCCL INFO Channel 19/32 :    0
master:786:828 [0] NCCL INFO Channel 20/32 :    0
master:786:828 [0] NCCL INFO Channel 21/32 :    0
master:786:828 [0] NCCL INFO Channel 22/32 :    0
master:786:828 [0] NCCL INFO Channel 23/32 :    0
master:786:828 [0] NCCL INFO Channel 24/32 :    0
master:786:828 [0] NCCL INFO Channel 25/32 :    0
master:786:828 [0] NCCL INFO Channel 26/32 :    0
master:786:828 [0] NCCL INFO Channel 27/32 :    0
master:786:828 [0] NCCL INFO Channel 28/32 :    0
master:786:828 [0] NCCL INFO Channel 29/32 :    0
master:786:828 [0] NCCL INFO Channel 30/32 :    0
master:786:828 [0] NCCL INFO Channel 31/32 :    0
master:786:828 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0xa9fcc0 nRanks 01 busId 9f000
master:786:828 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:786:828 [0] NCCL INFO P2P Chunksize set to 131072
master:786:828 [0] NCCL INFO Connected all rings comm 0xa9fcc0 nRanks 01 busId 9f000
master:786:828 [0] NCCL INFO Connected all trees
master:786:828 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:786:828 [0] NCCL INFO Init config for nccl_context_test: 0
master:786:828 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2

master:786:828 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x48079883f045d3de localSize 464 used 67142608 bytes - Init COMPLETE
#
#                                                              out-of-place                       in-place          
#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)       
        7616          1904     float     sum      -1    23.66    0.32    0.00      0     0.16   46.30    0.00      0
       15236          3809     float     sum      -1    82.21    0.19    0.00      0     0.16   92.62    0.00      0
       30472          7618     float     sum      -1     5.07    6.01    0.00      0     0.16  185.24    0.00      0
       60944         15236     float     sum      -1    264.0    0.23    0.00      0     0.16  373.89    0.00      0
      121888         30472     float     sum      -1    610.8    0.20    0.00      0     0.16  743.22    0.00      0
      243776         60944     float     sum      -1     9.24   26.38    0.00      0     0.17  1442.46    0.00      0
      487552        121888     float     sum      -1    17.59   27.71    0.00      0     0.16  2972.88    0.00      0
      975104        243776     float     sum      -1     8.35  116.76    0.00      0     0.16  5963.94    0.00      0
     1950208        487552     float     sum      -1    623.9    3.13    0.00      0     0.17  11819.44    0.00      0
     3900416        975104     float     sum      -1   1861.1    2.10    0.00      0     0.16  23928.93    0.00      0
     7800832       1950208     float     sum      -1   3060.8    2.55    0.00      0     0.16  47566.05    0.00      0
    15601664       3900416     float     sum      -1   1961.2    7.96    0.00      0     0.16  95715.73    0.00      0
    31203328       7800832     float     sum      -1   4824.6    6.47    0.00      0     0.16  190846.04    0.00      0
    62406656      15601664     float     sum      -1   2331.5   26.77    0.00      0     0.16  380528.39    0.00      0
   124813312      31203328     float     sum      -1   2811.0   44.40    0.00      0     0.16  758743.54    0.00      0
   249626624      62406656     float     sum      -1   8424.8   29.63    0.00      0     0.16  1517487.08    0.00      0
   499253248     124813312     float     sum      -1    10212   48.89    0.00      0     0.16  3062903.36    0.00      0
   998506496     249626624     float     sum      -1    17520   56.99    0.00      0     0.16  6107073.37    0.00      0
master:786:786 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:786:786 [0] NCCL INFO comm 0xa9fcc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 0 
#