Commit 8743b701 authored by one's avatar one
Browse files

[xcl-lens] Support GDR info

parent 317eab29
......@@ -5,9 +5,12 @@ import pandas as pd
class RcclLogParser:
def __init__(self):
# (rank, content) -> None
# (host, rank, content) -> None
self.log_entries = dict()
# Verbosity flag used by report sections
self._verbose = False
# Pattern -> output string or as-is
self.sys_patterns = {
r"kernel version": None,
......@@ -61,29 +64,44 @@ class RcclLogParser:
print(" RCCL Log Parser Report ".center(80, "="))
print()
# Remember verbosity for sub-sections
self._verbose = verbose
self._report_sys()
self._report_user_envs()
self._report_gdr_info()
self._report_graph_info()
self._report_channel_transport_info(verbose)
self._report_cl_transfers()
self._report_channel_transport_info()
self._report_collective_transfers()
self._report_p2p_transfers()
print(" End of Report ".center(80, "="))
def _preprocess_line(self, line):
"""Extract and validate NCCL log lines with rank information"""
# Match lines that have a valid NCCL log format with rank
# Pattern: [rank] NCCL INFO/WARN/ERROR followed by content
"""Extract NCCL log lines with host/rank information."""
# Preferred format:
# <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
# where <host> itself does NOT contain ':' (so we always stop at first colon)
match = re.search(
r"^([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
line,
)
if match:
host, rank, content = match.group(1), int(match.group(2)), match.group(3)
self.log_entries[(host, rank, content)] = None
return
# Backward-compatible fallback for logs without host/pid/tid prefix
match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
if match:
rank, content = int(match.group(1)), match.group(2)
self.log_entries[(rank, content)] = None
self.log_entries[("-", rank, content)] = None
def _report_sys(self):
"""Search patterns and print pre-defined strings if matched"""
print("===> System Information:\n")
reported = set()
for (_, content), _ in self.log_entries.items():
for (_, _, content), _ in self.log_entries.items():
for pattern, out in self.sys_patterns.items():
if re.search(pattern, content, re.IGNORECASE):
reported.add(out if out is not None else content)
......@@ -97,7 +115,7 @@ class RcclLogParser:
print("===> User-defined Environment Variables:\n")
env_vars = {}
pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
for (_, content), _ in self.log_entries.items():
for (_, _, content), _ in self.log_entries.items():
m = pattern.search(content)
if m:
var_name, var_value = m.group(1), m.group(2)
......@@ -111,6 +129,89 @@ class RcclLogParser:
)
print()
def _report_gdr_info(self):
"""Parse and print GPU Direct RDMA (GDR) related information."""
print("===> GDR Info:\n")
# Part 1: NET/IB : GPU Direct RDMA Enabled for HCA <hca_no> '<hca_id>'
ib_rows = []
pattern_ib = re.compile(
r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'"
)
for (host, rank, content), _ in self.log_entries.items():
m = pattern_ib.search(content)
if m:
hca_no, hca_id = m.groups()
ib_rows.append(
{
"host": host,
"rank": rank,
"hca_no": int(hca_no),
"hca_id": hca_id,
"gdr": 1,
}
)
print(" NET/IB : GPU Direct RDMA Enabled for:\n")
if ib_rows:
df_ib = pd.DataFrame(ib_rows)
df_ib.drop_duplicates(inplace=True)
df_ib.sort_values(by=["host", "rank", "hca_no", "hca_id"], inplace=True)
df_ib = df_ib[["host", "rank", "hca_no", "hca_id", "gdr"]]
if not self._verbose:
df_ib = df_ib.drop(columns=["host"])
df_ib.drop_duplicates(inplace=True)
print(df_ib.to_string(index=False))
print()
else:
print(" (No data found)\n")
# Part 2: GPU Direct RDMA Enabled for GPU <gpu> / HCA <hca_no> (distance <expr>), read <0|1>
gpu_rows = []
pattern_gpu = re.compile(
r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
)
for (host, rank, content), _ in self.log_entries.items():
m = pattern_gpu.search(content)
if m:
gpu, hca_no, distance, read_flag = m.groups()
rw = "read" if read_flag == "1" else "write"
distance_expr = distance.strip()
# Split expressions like "4 <= 7" into distance and max_distance
m_dist = re.match(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$", distance_expr)
if m_dist:
distance_val, max_distance = m_dist.groups()
else:
distance_val, max_distance = distance_expr, "-"
gpu_rows.append(
{
"host": host,
"rank": rank,
"gpu": gpu,
"hca_no": int(hca_no),
"distance": distance_val,
"max_distance": max_distance,
"r/w": rw,
}
)
print(" GPU Direct RDMA Enabled for GPU:\n")
if gpu_rows:
df_gpu = pd.DataFrame(gpu_rows)
df_gpu.drop_duplicates(inplace=True)
df_gpu.sort_values(
by=["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
inplace=True,
)
df_gpu = df_gpu[["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"]]
if not self._verbose:
df_gpu = df_gpu.drop(columns=["host"])
df_gpu.drop_duplicates(inplace=True)
print(df_gpu.to_string(index=False))
print()
else:
print(" (No data found)\n")
def _extract_and_print(self, title, filter_func, fields, mandatory, sort_cols, move_rank=True):
"""
Generic function to extract structured data from log lines and print as a table.
......@@ -140,13 +241,13 @@ class RcclLogParser:
print(f"===> {title}:\n")
# Filter relevant log lines using the provided filter function
data = [(r, c) for (r, c), _ in self.log_entries.items() if filter_func(c)]
data = [(h, r, c) for (h, r, c), _ in self.log_entries.items() if filter_func(c)]
if not data:
print(" (No data found)\n")
return
# Create DataFrame and extract all fields using regex with validation
df = pd.DataFrame(data, columns=["rank", "raw_log"])
df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
for pattern, (col_name, val_pattern) in fields.items():
# Extract field with strict value validation using word boundary
df[col_name] = df["raw_log"].str.extract(
......@@ -181,6 +282,9 @@ class RcclLogParser:
mandatory = [c for c in mandatory if c in df.columns]
df.dropna(subset=mandatory, inplace=True) # Remove rows missing mandatory fields
df.drop(columns=["raw_log"], inplace=True) # No longer need raw log
if not self._verbose and "host" in df.columns:
df = df.drop(columns=["host"])
df.drop_duplicates(inplace=True) # Deduplicate identical records
if df.empty:
......@@ -190,17 +294,23 @@ class RcclLogParser:
# Reorder columns for better readability
if move_rank:
cols = df.columns.tolist()
if "host" in cols:
cols.remove("host")
cols.remove("rank")
# Move protocol to second position if present
if "protocol" in cols:
cols.remove("protocol")
cols.insert(0, "protocol")
# Always move rank to first position
# Always move host and rank to front
cols.insert(0, "rank")
if "host" in cols:
cols.insert(0, "host")
df = df[cols]
# Sort the data
sort_cols = [c for c in sort_cols if c in df.columns]
if "host" in df.columns and "host" not in sort_cols:
sort_cols.insert(0, "host")
if sort_cols:
df.sort_values(by=sort_cols, inplace=True)
......@@ -222,7 +332,7 @@ class RcclLogParser:
sort_cols=["rank", "Pattern"],
)
def _report_channel_transport_info(self, verbose=False):
def _report_channel_transport_info(self):
print("===> Channel Transport Info:\n")
data = []
......@@ -237,12 +347,13 @@ class RcclLogParser:
r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
)
for (rank, content), _ in self.log_entries.items():
for (host, rank, content), _ in self.log_entries.items():
m = pattern.search(content)
if m:
channel, src, dst, type_, transport = m.groups()
data.append(
{
"host": host,
"rank": rank,
"channel": channel,
"sender": int(src),
......@@ -257,14 +368,14 @@ class RcclLogParser:
return
df = pd.DataFrame(data)
df.sort_values(by=["rank", "channel", "sender", "receiver"], inplace=True)
if not verbose:
df = df.drop(columns=["channel", "sender", "receiver"])
df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
if not self._verbose:
df = df.drop(columns=["host", "channel", "sender", "receiver"])
df.drop_duplicates(inplace=True)
print(df.to_string(index=False))
print()
def _report_cl_transfers(self):
def _report_collective_transfers(self):
self._extract_and_print(
title="Unique Ring/Tree Transfers",
filter_func=lambda c: "protocol" in c and "nbytes" in c,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment