Commit df9bba54 authored by one
Browse files

[xcl-lens] Add channel transport info

parent 9f243032
...@@ -34,15 +34,15 @@ def main(): ...@@ -34,15 +34,15 @@ def main():
# Parse command line arguments # Parse command line arguments
parser = argparse.ArgumentParser(description="RCCL Log Parser Wrapper") parser = argparse.ArgumentParser(description="RCCL Log Parser Wrapper")
parser.add_argument( parser.add_argument(
"-v", "--verbose", action="store_true", help="Print raw log lines in addition to the report" "--raw", action="store_true", help="Print raw log lines in addition to the report"
) )
parser.add_argument("-v", "--verbose", action="store_true", help="Print verbose reports")
parser.add_argument( parser.add_argument(
"command", nargs=argparse.REMAINDER, help="The executable and arguments to run" "command", nargs=argparse.REMAINDER, help="The executable and arguments to run"
) )
args = parser.parse_args() args = parser.parse_args()
verbose = args.verbose
cmd = args.command cmd = args.command
# Check if command is provided # Check if command is provided
...@@ -72,14 +72,14 @@ def main(): ...@@ -72,14 +72,14 @@ def main():
# Collect all output lines # Collect all output lines
for line in process.stdout: for line in process.stdout:
if verbose: if args.raw:
print(f"{line}", end="", flush=True) print(f"{line}", end="", flush=True)
parser.collect(line) parser.collect(line)
process.wait() process.wait()
if rank == 0: if rank == 0:
parser.report() parser.report(verbose=args.verbose)
sys.exit(process.returncode) sys.exit(process.returncode)
except KeyboardInterrupt: except KeyboardInterrupt:
......
...@@ -17,6 +17,8 @@ class RcclLogParser: ...@@ -17,6 +17,8 @@ class RcclLogParser:
r"iommu": None, r"iommu": None,
r"Dmabuf feature disabled": "Dmabuf: disabled", r"Dmabuf feature disabled": "Dmabuf: disabled",
r"Disabled GDRCopy": "GDRCopy: disabled", r"Disabled GDRCopy": "GDRCopy: disabled",
r"Using network IB": "NET/IB: enabled",
r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
} }
# Pattern -> column with strict validation # Pattern -> column with strict validation
...@@ -55,13 +57,14 @@ class RcclLogParser: ...@@ -55,13 +57,14 @@ class RcclLogParser:
def collect(self, line): def collect(self, line):
self._preprocess_line(line) self._preprocess_line(line)
def report(self): def report(self, verbose=False):
print(" RCCL Log Parser Report ".center(80, "=")) print(" RCCL Log Parser Report ".center(80, "="))
print() print()
self._report_sys() self._report_sys()
self._report_user_envs() self._report_user_envs()
self._report_graph_info() self._report_graph_info()
self._report_channel_transport_info(verbose)
self._report_cl_transfers() self._report_cl_transfers()
self._report_p2p_transfers() self._report_p2p_transfers()
...@@ -74,7 +77,6 @@ class RcclLogParser: ...@@ -74,7 +77,6 @@ class RcclLogParser:
match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line) match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
if match: if match:
rank, content = int(match.group(1)), match.group(2) rank, content = int(match.group(1)), match.group(2)
if len(content) >= 20:
self.log_entries[(rank, content)] = None self.log_entries[(rank, content)] = None
def _report_sys(self): def _report_sys(self):
...@@ -94,7 +96,7 @@ class RcclLogParser: ...@@ -94,7 +96,7 @@ class RcclLogParser:
"""Search environment variables set by user""" """Search environment variables set by user"""
print("===> User-defined Environment Variables:\n") print("===> User-defined Environment Variables:\n")
env_vars = {} env_vars = {}
pattern = re.compile(r"(\w+)\s+set by environment to\s+(.+)") pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
for (_, content), _ in self.log_entries.items(): for (_, content), _ in self.log_entries.items():
m = pattern.search(content) m = pattern.search(content)
if m: if m:
...@@ -220,6 +222,48 @@ class RcclLogParser: ...@@ -220,6 +222,48 @@ class RcclLogParser:
sort_cols=["rank", "Pattern"], sort_cols=["rank", "Pattern"],
) )
def _report_channel_transport_info(self, verbose=False):
    """Print a table of the transport used by each channel connection.

    Scans the keys of ``self.log_entries`` (``(rank, content)`` pairs) for
    lines shaped like
    ``Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA``
    and prints one row per match, sorted by rank, channel, sender and
    receiver.

    Args:
        verbose: When true, keep the per-connection columns (channel,
            sender, receiver). Otherwise drop those columns and print only
            the distinct rank/type/transport combinations.
    """
    print("===> Channel Transport Info:\n")

    # Capture groups, in order:
    #   1. channel     e.g. "00/0"
    #   2. source      e.g. "2"
    #   3. destination e.g. "1"
    #   4. direction   e.g. "send" or "receive" (optional)
    #   5. transport   e.g. "P2P/IPC", "NET/IB/6/GDRDMA"
    channel_re = re.compile(
        r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
        r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
    )

    rows = []
    for rank, content in self.log_entries:
        found = channel_re.search(content)
        if found is None:
            continue
        channel, sender, receiver, direction, transport = found.groups()
        rows.append(
            {
                "rank": rank,
                "channel": channel,
                "sender": int(sender),
                "receiver": int(receiver),
                # The direction tag is absent on some lines; show a dash.
                "type": direction or "-",
                "transport": transport,
            }
        )

    if not rows:
        print(" (No data found)\n")
        return

    table = pd.DataFrame(rows).sort_values(
        by=["rank", "channel", "sender", "receiver"]
    )
    if not verbose:
        # Without the per-connection columns many rows collapse to the same
        # rank/type/transport triple, so keep only one of each.
        table = table.drop(columns=["channel", "sender", "receiver"]).drop_duplicates()

    print(table.to_string(index=False), end="\n\n")
def _report_cl_transfers(self): def _report_cl_transfers(self):
self._extract_and_print( self._extract_and_print(
title="Unique Ring/Tree Transfers", title="Unique Ring/Tree Transfers",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment