stdout 8.04 KB
Newer Older
wangkaixiong's avatar
init  
wangkaixiong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# nThreads: 1 nGpus: 1 nRanks: 1 minBytes: 7618 maxBytes: 1073741824 step: 2(factor) warmupIters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#   Rank  0 Pid    782 on     master device  0 [0000:9f:00.0] BW200
master:782:782 [0] NCCL INFO Bootstrap : Using ibs66f0:10.10.10.1<0>
master:782:782 [0] NCCL INFO NET/Plugin : Plugin load (librccl-net.so) returned 2 : librccl-net.so: cannot open shared object file: No such file or directory
master:782:782 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation

master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:121 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
master:782:782 [0] NCCL INFO Kernel version: 4.19.90-89.11.v2401.ky10.x86_64

master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:142 NCCL WARN Missing "iommu=pt" from kernel command line which can lead to system instablity or hang!

master:782:782 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:147 NCCL WARN Missing "HSA_FORCE_FINE_GRAIN_PCIE=1" from environment which can lead to low RCCL performance, system instablity or hang!
master:782:782 [0] NCCL INFO ROCr version 1.1
master:782:782 [0] NCCL INFO Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1
RCCL version 2.18.3+hip6.1 HEAD:037e9b3
master:782:816 [0] NCCL INFO NET/IB : Using [0]mlx5_1:1/IB [1]mlx5_2:1/IB [2]mlx5_3:1/IB [3]mlx5_4:1/IB [4]mlx5_6:1/RoCE [5]mlx5_7:1/IB [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB ibs66f0:10.10.10.1<0>
master:782:816 [0] NCCL INFO Using network IB
master:782:816 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x2a9de5f131333451 - Init START
master:782:816 [0] NCCL INFO /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/transport/net_ib.cc:323 -> 2
master:782:816 [0] NCCL INFO NCCL_TOPO_FILE set by environment to /data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml
master:782:816 [0] NCCL INFO rocm_smi_lib: version 2.8.0.0
master:782:816 [0] NCCL INFO NCCL_NET_GDR_READ set by environment to 1.

master:782:816 [0] /data/jenkins_workspace/workspace/rccl_release/build/hipify/src/init.cc:1299 NCCL WARN -hcugon- rank 0 localRanks 1 nRanks 1 invalid rank num
master:782:816 [0] NCCL INFO Channel 00/32 :    0
master:782:816 [0] NCCL INFO Channel 01/32 :    0
master:782:816 [0] NCCL INFO Channel 02/32 :    0
master:782:816 [0] NCCL INFO Channel 03/32 :    0
master:782:816 [0] NCCL INFO Channel 04/32 :    0
master:782:816 [0] NCCL INFO Channel 05/32 :    0
master:782:816 [0] NCCL INFO Channel 06/32 :    0
master:782:816 [0] NCCL INFO Channel 07/32 :    0
master:782:816 [0] NCCL INFO Channel 08/32 :    0
master:782:816 [0] NCCL INFO Channel 09/32 :    0
master:782:816 [0] NCCL INFO Channel 10/32 :    0
master:782:816 [0] NCCL INFO Channel 11/32 :    0
master:782:816 [0] NCCL INFO Channel 12/32 :    0
master:782:816 [0] NCCL INFO Channel 13/32 :    0
master:782:816 [0] NCCL INFO Channel 14/32 :    0
master:782:816 [0] NCCL INFO Channel 15/32 :    0
master:782:816 [0] NCCL INFO Channel 16/32 :    0
master:782:816 [0] NCCL INFO Channel 17/32 :    0
master:782:816 [0] NCCL INFO Channel 18/32 :    0
master:782:816 [0] NCCL INFO Channel 19/32 :    0
master:782:816 [0] NCCL INFO Channel 20/32 :    0
master:782:816 [0] NCCL INFO Channel 21/32 :    0
master:782:816 [0] NCCL INFO Channel 22/32 :    0
master:782:816 [0] NCCL INFO Channel 23/32 :    0
master:782:816 [0] NCCL INFO Channel 24/32 :    0
master:782:816 [0] NCCL INFO Channel 25/32 :    0
master:782:816 [0] NCCL INFO Channel 26/32 :    0
master:782:816 [0] NCCL INFO Channel 27/32 :    0
master:782:816 [0] NCCL INFO Channel 28/32 :    0
master:782:816 [0] NCCL INFO Channel 29/32 :    0
master:782:816 [0] NCCL INFO Channel 30/32 :    0
master:782:816 [0] NCCL INFO Channel 31/32 :    0
master:782:816 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 comm 0x8a8cc0 nRanks 01 busId 9f000
master:782:816 [0] NCCL INFO -hcugon- create sdma group queue rank:0 localRanks:1 nRanks:1 sdma copy is not enabled
master:782:816 [0] NCCL INFO P2P Chunksize set to 131072
master:782:816 [0] NCCL INFO Connected all rings comm 0x8a8cc0 nRanks 01 busId 9f000
master:782:816 [0] NCCL INFO Connected all trees
master:782:816 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 4 p2p channels per peer
master:782:816 [0] NCCL INFO Init config for nccl_context_test: 0
master:782:816 [0] NCCL INFO Maximum number of GPUs in any NUMA node: 2

master:782:816 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 commId 0x2a9de5f131333451 localSize 464 used 67142608 bytes - Init COMPLETE
#
#                                                              out-of-place                       in-place          
#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)       
        7616          1904     float     sum      -1    272.8    0.03    0.00      0     0.17   44.67    0.00      0
       15236          3809     float     sum      -1   1336.3    0.01    0.00      0     0.16   93.19    0.00      0
       30472          7618     float     sum      -1     4.38    6.96    0.00      0     0.17  184.12    0.00      0
       60944         15236     float     sum      -1    81.28    0.75    0.00      0     0.16  371.61    0.00      0
      121888         30472     float     sum      -1     6.40   19.06    0.00      0     0.17  738.72    0.00      0
      243776         60944     float     sum      -1     4.50   54.11    0.00      0     0.16  1509.45    0.00      0
      487552        121888     float     sum      -1    765.5    0.64    0.00      0     0.17  2954.86    0.00      0
      975104        243776     float     sum      -1    55.02   17.72    0.00      0     0.16  5963.94    0.00      0
     1950208        487552     float     sum      -1    687.0    2.84    0.00      0     0.16  11891.51    0.00      0
     3900416        975104     float     sum      -1   1924.9    2.03    0.00      0     0.16  24002.56    0.00      0
     7800832       1950208     float     sum      -1   2250.7    3.47    0.00      0     0.16  47566.05    0.00      0
    15601664       3900416     float     sum      -1   1429.8   10.91    0.00      0     0.16  95715.73    0.00      0
    31203328       7800832     float     sum      -1   5152.1    6.06    0.00      0     0.16  190264.20    0.00      0
    62406656      15601664     float     sum      -1   7107.6    8.78    0.00      0     0.16  384040.96    0.00      0
   124813312      31203328     float     sum      -1   5949.0   20.98    0.00      0     0.16  763384.17    0.00      0
   249626624      62406656     float     sum      -1   6837.0   36.51    0.00      0     0.16  1522113.56    0.00      0
   499253248     124813312     float     sum      -1    11251   44.38    0.00      0     0.16  3044227.12    0.00      0
   998506496     249626624     float     sum      -1    17819   56.04    0.00      0     0.16  6069948.30    0.00      0
master:782:782 [0] NCCL INFO -hcugon- commCleanup rank:0 sdmaCountEnable:0
master:782:782 [0] NCCL INFO comm 0x8a8cc0 rank 0 nranks 1 cudaDev 0 busId 9f000 - Destroy COMPLETE
# Errors with asterisks indicate errors that have exceeded the maximum threshold.
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 0 
#