Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
a95f20e8
Commit
a95f20e8
authored
Mar 07, 2026
by
one
Browse files
[xcl-lens] Fix filters
parent
deed39a3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
15 deletions
+31
-15
projects/xcl-lens/src/xcl_lens/parser/rccl.py
projects/xcl-lens/src/xcl_lens/parser/rccl.py
+12
-11
projects/xcl-lens/src/xcl_lens/runner.py
projects/xcl-lens/src/xcl_lens/runner.py
+19
-4
No files found.
projects/xcl-lens/src/xcl_lens/parser/rccl.py
View file @
a95f20e8
...
...
@@ -70,7 +70,8 @@ class RcclLogParser:
self
.
_report_sys
()
self
.
_report_user_envs
()
self
.
_report_gdr_info
()
self
.
_report_net_ib_info
()
self
.
_report_gdr_rw_info
()
self
.
_report_graph_info
()
self
.
_report_channel_transport_info
()
self
.
_report_collective_transfers
()
...
...
@@ -83,8 +84,8 @@ class RcclLogParser:
# Preferred format:
# <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
# where <host> itself does NOT contain ':' (so we always stop at first colon)
match
=
re
.
sear
ch
(
r
"
^
([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)"
,
match
=
re
.
mat
ch
(
r
"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)"
,
line
,
)
if
match
:
...
...
@@ -136,11 +137,10 @@ class RcclLogParser:
)
print
()
def
_report_
gdr
_info
(
self
):
"""Parse and print GPU Direct RDMA
(GDR) related
information."""
print
(
"===>
GDR
Info:
\n
"
)
def
_report_
net_ib
_info
(
self
):
"""Parse and print
NET/IB
GPU Direct RDMA
HCA
information."""
print
(
"===>
NET/IB
Info:
\n
"
)
# Part 1: NET/IB : GPU Direct RDMA Enabled for HCA <hca_no> '<hca_id>'
ib_rows
=
[]
pattern_ib
=
re
.
compile
(
r
"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'"
)
for
(
host
,
rank
,
content
),
_
in
self
.
log_entries
.
items
():
...
...
@@ -157,7 +157,6 @@ class RcclLogParser:
}
)
print
(
" NET/IB : GPU Direct RDMA Enabled for:
\n
"
)
if
ib_rows
:
df_ib
=
pd
.
DataFrame
(
ib_rows
)
df_ib
.
drop_duplicates
(
inplace
=
True
)
...
...
@@ -168,7 +167,10 @@ class RcclLogParser:
else
:
print
(
" (No data found)
\n
"
)
# Part 2: GPU Direct RDMA Enabled for GPU <gpu> / HCA <hca_no> (distance <expr>), read <0|1>
def
_report_gdr_rw_info
(
self
):
"""Parse and print GPU Direct RDMA read/write information."""
print
(
"===> GDR R/W Info:
\n
"
)
gpu_rows
=
[]
pattern_gpu
=
re
.
compile
(
r
"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
...
...
@@ -198,7 +200,6 @@ class RcclLogParser:
}
)
print
(
" GPU Direct RDMA Enabled for GPU:
\n
"
)
if
gpu_rows
:
df_gpu
=
pd
.
DataFrame
(
gpu_rows
)
df_gpu
.
drop_duplicates
(
inplace
=
True
)
...
...
@@ -390,7 +391,7 @@ class RcclLogParser:
return
df
=
pd
.
DataFrame
(
data
)
df
.
sort_values
(
by
=
[
"host"
,
"rank"
,
"channel"
,
"sender"
,
"receiver"
],
inplace
=
True
)
df
.
drop_duplicates
(
inplace
=
True
)
df
.
sort_values
(
by
=
[
"host"
,
"rank"
,
"channel"
,
"sender"
,
"receiver"
],
inplace
=
True
)
print
(
df
.
to_string
(
index
=
False
))
print
()
projects/xcl-lens/src/xcl_lens/runner.py
View file @
a95f20e8
...
...
@@ -40,7 +40,13 @@ def run_with_input(
# Case 3: Execute as command
return
_execute_command
(
summary
=
summary
,
verbose
=
verbose
,
rank
=
rank
,
log_prefix
=
log_prefix
,
cmd
=
command
summary
=
summary
,
verbose
=
verbose
,
hosts
=
hosts
,
ranks
=
ranks
,
rank
=
rank
,
log_prefix
=
log_prefix
,
cmd
=
command
,
)
...
...
@@ -84,14 +90,23 @@ def _process_files(
return
0
def
_execute_command
(
*
,
summary
:
bool
,
verbose
:
bool
,
rank
:
int
,
log_prefix
:
str
,
cmd
:
list
[
str
]):
def
_execute_command
(
*
,
summary
:
bool
,
verbose
:
bool
,
hosts
:
list
[
str
]
|
None
,
ranks
:
list
[
str
]
|
None
,
rank
:
int
,
log_prefix
:
str
,
cmd
:
list
[
str
],
):
env
=
os
.
environ
.
copy
()
env
[
"NCCL_DEBUG"
]
=
"INFO"
env
[
"NCCL_DEBUG_SUBSYS"
]
=
"ALL"
print
(
f
"
{
log_prefix
}
[Wrapper] Running command:
{
' '
.
join
(
cmd
)
}
"
)
parser
=
RcclLogParser
()
parser
=
RcclLogParser
(
verbose
=
verbose
,
hosts
=
hosts
,
ranks
=
ranks
)
process
=
subprocess
.
Popen
(
cmd
,
env
=
env
,
...
...
@@ -110,6 +125,6 @@ def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str
process
.
wait
()
if
rank
==
0
:
parser
.
report
(
verbose
=
verbose
)
parser
.
report
()
return
process
.
returncode
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment