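"""Parser for RCCL/NCCL debug logs.

Collects NCCL INFO/WARN/ERROR lines (optionally filtered by host and rank) and
prints a consolidated report: system information, user-set environment
variables, topology mapping status, NET/IB and GDR settings, graph and channel
transport details, and collective/P2P transfer summaries.
"""
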
import re

import pandas as pd


class RcclLogParser:
    def __init__(self, verbose=False, hosts=None, ranks=None):
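        """Set up an empty parser.

        Args:
            verbose: If True, keep per-host/per-rank detail columns and print
                the channel transport section.
            hosts: Optional list of host names to keep; no filtering if None.
            ranks: Optional list of ranks to keep; no filtering if None.
        """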
        # Deduplicated set of (host, rank, content) tuples
        self.log_entries: set[tuple[str, int, str]] = set()

        # Verbosity flag used by report sections
        self._verbose = verbose

        # Filters
        self._hosts = hosts if hosts is not None else []
        self._ranks = [int(r) for r in ranks] if ranks is not None else []

    def collect(self, line):
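        """Ingest a single raw log line; call once per line of the log."""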
        self._preprocess_line(line)

    def report(self):
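        """Print all report sections to stdout, framed by a header and footer."""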
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        self._report_sys()
        self._report_user_envs()
        self._report_topo_mapping_info()
        self._report_net_ib_info()
        self._report_gdr_rw_info()
        self._report_graph_info()
        self._report_channel_transport_info()
        self._report_collective_transfers()
        self._report_p2p_transfers()
        self._report_pcie_mem_channel_info()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
        """Extract NCCL log lines with host/rank information."""
        # Preferred format:
        #   <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
        # where <host> itself does NOT contain ':' (so we always stop at first colon)
        match = re.match(
            r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
            line,
        )
        if match:
            host, rank, content = match.group(1), int(match.group(2)), match.group(3)
            if self._hosts and host not in self._hosts:
                return
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add((host, rank, content))
            return

        # Backward-compatible fallback for logs without host/pid/tid prefix
        match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
        if match:
            rank, content = int(match.group(1)), match.group(2)
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add(("-", rank, content))

    def _report_sys(self):
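        """Report system-level facts (kernel, ROCr/RCCL versions, feature flags)."""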
        sys_specs = [
            (r"kernel version\s*:?\s*(.+)", "kernel version", 1, None),
            (r"ROCr version\s*:?\s*(.+)", "ROCr version", 1, None),
            (r"RCCL version\s*:?\s*(.+)", "RCCL version", 1, None),
            (r"Librccl path\s*:?\s*(.+)", "Librccl path", 1, None),
            (r'(Missing "iommu=pt".*|iommu.*)', "iommu", 1, None),
            (r"Dmabuf feature disabled", "Dmabuf", None, "disabled"),
            (r"Disabled GDRCopy", "GDRCopy", None, "disabled"),
            (r"Using network IB", "NET/IB", None, "enabled"),
            (r"NET/Plugin: Could not find: librccl-net.so", "NET/Plugin", None, "internal"),
            (r"XDP is disabled", "XDP", None, "disabled"),
        ]

        records = {}
        for host, rank, content in self.log_entries:
            for pattern, field, group_idx, literal in sys_specs:
                m = re.search(pattern, content, re.IGNORECASE)
                if not m:
                    continue
                value = literal if group_idx is None else m.group(group_idx).strip()
                records.setdefault(field, {}).setdefault((host, rank), set()).add(value)
                break

        self._print_consistency_report("System Information", records)

    def _report_user_envs(self):
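        """Report NCCL_*/RCCL_* environment variables set by the user."""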
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
        records = {}
        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if not m:
                continue
            var_name, var_value = m.group(1), m.group(2).strip()
            records.setdefault(var_name, {}).setdefault((host, rank), set()).add(var_value)

        self._print_consistency_report("User-defined Environment Variables", records)

    def _print_consistency_report(self, title, records):
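        """Print `records` ({field: {(host, rank): set(values)}}), warning when a
        field's value differs across nodes or ranks.
        """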
        print(f"===> {title}:\n")

        if not records:
            print("  (No data found)\n")
            return

        for field in sorted(records):
            entries = records[field]
            values = sorted({value for field_values in entries.values() for value in field_values})
            if len(values) == 1:
                print(f"{field}: {values[0]}")
                continue

            host_values = {}
            host_has_rank_conflict = set()
            for (host, _rank), field_values in entries.items():
                host_values.setdefault(host, set()).update(field_values)
                if len(field_values) > 1:
                    host_has_rank_conflict.add(host)

            for host, field_values in host_values.items():
                if len(field_values) > 1:
                    host_has_rank_conflict.add(host)

            warning_scope = "nodes" if len(host_values) > 1 else "ranks"
            print(f"{field}: (WARNING: Different values across {warning_scope})")
            for host in sorted(host_values):
                if host not in host_has_rank_conflict:
                    joined = " | ".join(sorted(host_values[host]))
                    print(f"  {host}: {joined}")
                    continue

                for (entry_host, rank), field_values in sorted(entries.items()):
                    if entry_host != host:
                        continue
                    joined = " | ".join(sorted(field_values))
                    print(f"  {host} rank {rank}: {joined}")
        print()

    def _report_net_ib_info(self):
        """Parse and print NET/IB GPU Direct RDMA HCA information."""
        print("===> NET/IB Info:\n")

        ib_rows = []
        pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
        for host, rank, content in self.log_entries:
            m = pattern_ib.search(content)
            if m:
                hca_no, hca_id = m.groups()
                ib_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "hca_no": int(hca_no),
                        "hca_id": hca_id,
                        "gdr": 1,
                    }
                )

        if ib_rows:
            df_ib = pd.DataFrame(ib_rows)
            df_ib.drop_duplicates(inplace=True)
            df_ib.sort_values(by=["host", "rank", "hca_no", "hca_id"], inplace=True)
            df_ib = df_ib[["host", "rank", "hca_no", "hca_id", "gdr"]]
            print(df_ib.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

    def _report_gdr_rw_info(self):
        """Parse and print GPU Direct RDMA read/write information."""
        print("===> GDR R/W Info:\n")

        gpu_rows = []
        pattern_gpu = re.compile(
            r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
            r"HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
        )
        for host, rank, content in self.log_entries:
            m = pattern_gpu.search(content)
            if m:
                gpu, hca_no, distance, read_flag = m.groups()
                rw = "read" if read_flag == "1" else "write"
                distance_expr = distance.strip()
                # Split expressions like "4 <= 7" into distance and max_distance
                m_dist = re.match(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$", distance_expr)
                if m_dist:
                    distance_val, max_distance = m_dist.groups()
                else:
                    distance_val, max_distance = distance_expr, "-"
                gpu_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "gpu": gpu,
                        "hca_no": int(hca_no),
                        "distance": distance_val,
                        "max_distance": max_distance,
                        "r/w": rw,
                    }
                )

        if gpu_rows:
            df_gpu = pd.DataFrame(gpu_rows)
            df_gpu.drop_duplicates(inplace=True)
            df_gpu.sort_values(
                by=["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
                inplace=True,
            )
            df_gpu = df_gpu[["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"]]
            print(df_gpu.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

    def _extract_and_print(
        self, title, filter_func, fields, mandatory, verbose_cols, sort_cols, move_rank=True
    ):
        """
        Generic function to extract structured data from log lines and print as a table.

        This handles the common workflow for tabular report sections
        (Graph Info, Ring/Tree Transfers, P2P Transfers, ...). It does NOT apply to
        free-form sections such as System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Clean and validate the data
        4. Reorder columns for readability
        5. Sort and print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
                   A 3-item tuple (col_name, value_pattern, literal_value) is also
                   supported: if value_pattern is None, literal_value is assigned to
                   rows whose raw log matches the key pattern; otherwise literal_value
                   is used as the fallback where no value could be extracted.
            mandatory: List of column names that must not be NaN (drop rows missing these)
            verbose_cols: List of column names to drop when not verbose
            sort_cols: List of column names to sort by (in order)
            move_rank: If True, move "host", "rank", and "protocol" (when present) to the front
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
        data = [(h, r, c) for h, r, c in self.log_entries if filter_func(c)]
        if not data:
            print("  (No data found)\n")
            return

        # Create DataFrame and extract all fields using regex with validation
        df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
        for pattern, field_spec in fields.items():
            if len(field_spec) == 2:
                col_name, val_pattern = field_spec
                # Extract field with strict value validation using word boundary
                df[col_name] = df["raw_log"].str.extract(
                    rf"\b{pattern}\s+({val_pattern})", expand=False
                )
            elif len(field_spec) == 3:
                col_name, val_pattern, literal_value = field_spec
                if val_pattern is None:
                    matched = df["raw_log"].str.contains(rf"\b{pattern}\b", regex=True)
                    df[col_name] = pd.Series(pd.NA, index=df.index, dtype="object")
                    df.loc[matched, col_name] = literal_value
                else:
                    df[col_name] = (
                        df["raw_log"]
                        .str.extract(rf"\b{pattern}\s+({val_pattern})", expand=False)
                        .fillna(literal_value)
                    )
            else:
                raise ValueError(
                    f"Invalid field spec for pattern {pattern!r}: expected 2 or 3 items"
                )

        # Drop verbose columns if not verbose
        if not self._verbose:
            df = df.drop(columns=verbose_cols, errors="ignore")

        # Convert numeric fields to appropriate types
        numeric_columns = [
            "Pattern",
            "nbytes",
            "nchannels",
            "local",
            "send",
            "recv",
            "p2pnChannelsPerPeer",
            "p2pnChannels",
            "nChannelsMax",
            "crossNic",
            "nChannels",
            "sameChannels",
            "slicesteps",
            "nloops",
            "nsteps",
            "chunksize",
            "connIndex",
            "collXhclNum",
            "p2pXhclNum",
        ]
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Clean data - drop invalid rows and duplicates
        # Only keep columns that actually exist in the DataFrame
        mandatory = [c for c in mandatory if c in df.columns]
        df.dropna(subset=mandatory, inplace=True)  # Remove rows missing mandatory fields
        df.drop(columns=["raw_log"], inplace=True)  # No longer need raw log
        df.drop_duplicates(inplace=True)  # Deduplicate identical records

        if df.empty:
            print("  (No valid data found)\n")
            return

        # Reorder columns for better readability
        if move_rank:
            target_order = ["host", "rank", "protocol"]
            leading_cols = [c for c in target_order if c in df.columns]
            remaining_cols = [c for c in df.columns if c not in leading_cols]
            df = df[leading_cols + remaining_cols]

        # Sort the data
        sort_cols = [c for c in sort_cols if c in df.columns]
        if "host" in df.columns and "host" not in sort_cols:
            sort_cols.insert(0, "host")
        if sort_cols:
            df.sort_values(by=sort_cols, inplace=True)

        # Format integer columns to avoid trailing .0
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        # Print the final table with NaN values replaced by "-"
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_graph_info(self):
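        """Report graph search results (Pattern, crossNic, nChannels, bandwidth, ...)."""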
        # Pattern -> column with strict validation
        graph_info_fields = {
            r"Pattern": ("Pattern", r"\d+"),
            r"crossNic": ("crossNic", r"\d+"),
            r"nChannels": ("nChannels", r"\d+"),
            r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
            r"type": ("type", r"[\w/]+"),
            r"sameChannels": ("sameChannels", r"\d+"),
        }
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=graph_info_fields,
            mandatory=["Pattern"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "Pattern"],
        )

    def _report_pcie_mem_channel_info(self):
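        """Report PCIe mem channel settings (connIndex, collXhclNum, p2pXhclNum)."""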
        pcie_mem_fields = {
            r"enable pcie mem channel": ("pcie_mem", None, "enabled"),
            r"connIndex": ("connIndex", r"\d+"),
            r"collXhclNum": ("collXhclNum", r"\d+"),
            r"p2pXhclNum": ("p2pXhclNum", r"\d+"),
        }
        self._extract_and_print(
            title="PCIe Mem Channel Info",
            filter_func=lambda c: "enable pcie mem channel" in c.lower(),
            fields=pcie_mem_fields,
            mandatory=["pcie_mem", "connIndex", "collXhclNum", "p2pXhclNum"],
            verbose_cols=[],
            sort_cols=["host", "rank", "connIndex", "collXhclNum", "p2pXhclNum"],
        )

    def _report_collective_transfers(self):
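        """Report unique Ring/Tree collective transfers (protocol, nbytes, channels, ...)."""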
        # Pattern -> column with strict validation
        cl_transfer_fields = {
            r"protocol": ("protocol", r"Simple|LL|LL128"),
            r"nbytes": ("nbytes", r"\d+"),
            r"algorithm": ("algorithm", r"Tree|Ring"),
            r"slicesteps": ("slicesteps", r"\d+"),
            r"nchannels": ("nchannels", r"\d+"),
            r"nloops": ("nloops", r"\d+"),
            r"nsteps": ("nsteps", r"\d+"),
            r"chunksize": ("chunksize", r"\d+"),
        }
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
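        """Report unique point-to-point transfers (local/send/recv ranks, channel counts)."""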
        # Pattern -> column with strict validation
        p2p_fields = {
            r"p2p : rank": ("local", r"\d+"),
            r"send rank": ("send", r"\d+"),
            r"recv rank": ("recv", r"\d+"),
            r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
            r"p2pnChannels": ("p2pnChannels", r"\d+"),
            r"nChannelsMax": ("nChannelsMax", r"\d+"),
            r"protocol": ("protocol", r"Simple|LL|LL128"),
        }
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=p2p_fields,
            mandatory=["local", "send", "recv"],
            verbose_cols=["host", "rank", "local", "send", "recv"],
            sort_cols=["host", "rank", "protocol", "local", "send", "recv"],
        )

    def _report_channel_transport_info(self):
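        """Report per-channel transport paths (e.g. P2P/IPC, NET/IB/GDRDMA); verbose only."""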
        print("===> Channel Transport Info:\n")

        if not self._verbose:
            print("  (Skipped because verbose mode is not enabled)")
            print()
            return

        data = []

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    {
                        "host": host,
                        "rank": rank,
                        "channel": channel,
                        "sender": int(src),
                        "receiver": int(dst),
                        "type": type_ if type_ else "-",
                        "transport": transport,
                    }
                )

        if not data:
            print("  (No data found)\n")
            return

        df = pd.DataFrame(data)
        df.drop_duplicates(inplace=True)
        df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
        print(df.to_string(index=False))
        print()

    def _report_topo_mapping_info(self):
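        """Report topology mapping file status per host/rank (loaded, parsed, skipped)."""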
        print("===> Topology Mapping File Info:\n")

        topo_mapping_patterns = [
            (r"No topo mapping file", "status", None, "no_file"),
            (r"environmental key word is (\S+)", "fingerprint", 1, None),
            (r"Loading topology mapping file (\S+)", "loaded", 1, None),
            (r"(?:parseing|parsing) topology mapping group[:\s]*(.*)", "parsed", 1, None),
            (r"skip topology mapping group:\s*([^,]+)", "skipped", 1, None),
        ]
        records = {}

        for host, rank, content in self.log_entries:
            if not any(
                s in content.lower()
                for s in ("topo mapping", "topology mapping", "environmental key word")
            ):
                continue
            for pattern, field, group_idx, literal in topo_mapping_patterns:
                m = re.search(pattern, content, re.IGNORECASE)
                if not m:
                    continue
                rec = records.setdefault((host, rank), {"host": host, "rank": rank})
                value = (literal if group_idx is None else m.group(group_idx).strip()) or "-"
                if field in {"parsed", "skipped"}:
                    rec.setdefault(field, set()).add(value)
                else:
                    rec[field] = value
                break

        if not records:
            print("  (No data found)\n")
            return

        for rec in records.values():
            for field in ("parsed", "skipped"):
                if field in rec:
                    rec[field] = sorted(rec[field])
            if rec.get("status") == "no_file":
                continue
            if rec.get("parsed"):
                rec["status"] = "parsed"
            elif rec.get("skipped"):
                rec["status"] = "skipped"
            else:
                rec["status"] = "-"

        by_host = {}
        for (host, rank), rec in records.items():
            by_host.setdefault(host, []).append((rank, rec))

        field_order = ["status", "fingerprint", "loaded", "parsed", "skipped"]
        for host in sorted(by_host):
            print(host)
            host_records = sorted(by_host[host], key=lambda item: item[0])
            normalized = [
                {field: rec.get(field, "-") for field in field_order} for _, rec in host_records
            ]
            if len({repr(item) for item in normalized}) == 1:
                self._print_topo_record(normalized[0], indent="  ")
                print()
                continue

            for rank, rec in host_records:
                print(f"  rank {rank}")
                self._print_topo_record(rec, indent="    ")
            print()

    def _print_topo_record(self, record, indent):
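        """Print one topology mapping record at the given indentation."""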
        for field in ("status", "fingerprint", "loaded"):
            if field in record:
                print(f"{indent}{field}: {record[field]}")
        for field in ("parsed", "skipped"):
            if field not in record:
                continue
            print(f"{indent}{field}:")
            for value in record[field]:
                print(f"{indent}  {value}")