Commit df9bba54 authored by one
Browse files

[xcl-lens] Add channel transport info

parent 9f243032
......@@ -34,15 +34,15 @@ def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description="RCCL Log Parser Wrapper")
parser.add_argument(
"-v", "--verbose", action="store_true", help="Print raw log lines in addition to the report"
"--raw", action="store_true", help="Print raw log lines in addition to the report"
)
parser.add_argument("-v", "--verbose", action="store_true", help="Print verbose reports")
parser.add_argument(
"command", nargs=argparse.REMAINDER, help="The executable and arguments to run"
)
args = parser.parse_args()
verbose = args.verbose
cmd = args.command
# Check if command is provided
......@@ -72,14 +72,14 @@ def main():
# Collect all output lines
for line in process.stdout:
if verbose:
if args.raw:
print(f"{line}", end="", flush=True)
parser.collect(line)
process.wait()
if rank == 0:
parser.report()
parser.report(verbose=args.verbose)
sys.exit(process.returncode)
except KeyboardInterrupt:
......
......@@ -17,6 +17,8 @@ class RcclLogParser:
r"iommu": None,
r"Dmabuf feature disabled": "Dmabuf: disabled",
r"Disabled GDRCopy": "GDRCopy: disabled",
r"Using network IB": "NET/IB: enabled",
r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
}
# Pattern -> column with strict validation
......@@ -55,13 +57,14 @@ class RcclLogParser:
# Feed one raw log line into the parser; all handling is delegated to
# self._preprocess_line(line).
def collect(self, line):
self._preprocess_line(line)
def report(self):
def report(self, verbose=False):
print(" RCCL Log Parser Report ".center(80, "="))
print()
self._report_sys()
self._report_user_envs()
self._report_graph_info()
self._report_channel_transport_info(verbose)
self._report_cl_transfers()
self._report_p2p_transfers()
......@@ -74,7 +77,6 @@ class RcclLogParser:
match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
if match:
rank, content = int(match.group(1)), match.group(2)
if len(content) >= 20:
self.log_entries[(rank, content)] = None
def _report_sys(self):
......@@ -94,7 +96,7 @@ class RcclLogParser:
"""Search environment variables set by user"""
print("===> User-defined Environment Variables:\n")
env_vars = {}
pattern = re.compile(r"(\w+)\s+set by environment to\s+(.+)")
pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
for (_, content), _ in self.log_entries.items():
m = pattern.search(content)
if m:
......@@ -220,6 +222,48 @@ class RcclLogParser:
sort_cols=["rank", "Pattern"],
)
def _report_channel_transport_info(self, verbose=False):
    """Print which transport each channel connection uses.

    Scans the keys of ``self.log_entries`` (``(rank, content)`` tuples) for
    lines such as::

        Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA

    and prints the matches as a table sorted by rank, channel, sender and
    receiver, with duplicate rows removed.

    Args:
        verbose: When true, print one row per channel connection including
            the ``channel``, ``sender`` and ``receiver`` columns. When false,
            those columns are dropped so only the distinct
            ``(rank, type, transport)`` combinations remain.
    """
    print("===> Channel Transport Info:\n")

    # Group 1: channel (e.g., 00/0)
    # Group 2: src (e.g., 2)
    # Group 3: dst (e.g., 1)
    # Group 4: type (e.g., send or receive, optional)
    # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA, NET/IB/6(3)/GDRDMA)
    # Parentheses are accepted in the transport so that a proxied network
    # connection ("NET/IB/6(3)/GDRDMA") is not truncated to "NET/IB/6",
    # which would hide the GDRDMA/Shared suffix. Matching still stops at
    # the first whitespace, so trailing text after the transport is ignored.
    pattern = re.compile(
        r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
        r"(?: \[(\w+)\])?\s+via\s+([\w/()]+)"
    )

    data = []
    # Only the (rank, content) keys are needed; the mapped values are unused.
    for rank, content in self.log_entries:
        m = pattern.search(content)
        if m is None:
            continue
        channel, src, dst, direction, transport = m.groups()
        data.append(
            {
                "rank": rank,
                "channel": channel,
                "sender": int(src),
                "receiver": int(dst),
                # Lines without a [send]/[receive] tag get a placeholder.
                "type": direction if direction else "-",
                "transport": transport,
            }
        )

    if not data:
        print(" (No data found)\n")
        return

    df = pd.DataFrame(data).sort_values(by=["rank", "channel", "sender", "receiver"])
    if not verbose:
        # Collapse the per-channel rows into a per-rank summary.
        df = df.drop(columns=["channel", "sender", "receiver"])
    df = df.drop_duplicates()

    print(df.to_string(index=False))
    print()
def _report_cl_transfers(self):
self._extract_and_print(
title="Unique Ring/Tree Transfers",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment