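"""Parser for RCCL/NCCL debug logs.

Collects NCCL INFO/WARN/ERROR lines (optionally filtered by host and rank) and
prints a consolidated report: system information, user-set environment
variables, topology mapping status, NET/IB and GDR settings, graph and channel
transport details, and collective/P2P transfer summaries.
"""
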
import re

import pandas as pd


class RcclLogParser:
    def __init__(self, verbose=False, hosts=None, ranks=None):
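        """Set up an empty parser.

        Args:
            verbose: If True, keep per-host/per-rank detail columns and print
                the channel transport section.
            hosts: Optional list of host names to keep; no filtering if None.
            ranks: Optional list of ranks to keep; no filtering if None.
        """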
        # Deduplicated set of (host, rank, content) tuples
        self.log_entries: set[tuple[str, int, str]] = set()

        # Verbosity flag used by report sections
        self._verbose = verbose

        # Filters
        self._hosts = hosts if hosts is not None else []
        self._ranks = [int(r) for r in ranks] if ranks is not None else []

    def collect(self, line):
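        """Ingest a single raw log line; call once per line of the log."""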
        self._preprocess_line(line)

    def report(self):
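        """Print all report sections to stdout, framed by a header and footer."""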
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        self._report_sys()
        self._report_user_envs()
        self._report_topo_mapping_info()
        self._report_net_ib_info()
        self._report_gdr_rw_info()
        self._report_graph_info()
        self._report_channel_transport_info()
        self._report_collective_transfers()
        self._report_p2p_transfers()
        self._report_pcie_mem_channel_info()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
        """Extract NCCL log lines with host/rank information."""
        # Preferred format:
        #   <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
        # where <host> itself does NOT contain ':' (so we always stop at first colon)
        match = re.match(
            r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
            line,
        )
        if match:
            host, rank, content = match.group(1), int(match.group(2)), match.group(3)
            if self._hosts and host not in self._hosts:
                return
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add((host, rank, content))
            return

        # Backward-compatible fallback for logs without host/pid/tid prefix
        match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
        if match:
            rank, content = int(match.group(1)), match.group(2)
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add(("-", rank, content))

    def _report_sys(self):
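        """Report system-level facts (kernel, ROCr/RCCL versions, feature flags)."""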
        sys_specs = [
            (r"kernel version\s*:?\s*(.+)", "kernel version", 1, None),
            (r"ROCr version\s*:?\s*(.+)", "ROCr version", 1, None),
            (r"RCCL version\s*:?\s*(.+)", "RCCL version", 1, None),
            (r"Librccl path\s*:?\s*(.+)", "Librccl path", 1, None),
            (r'(Missing "iommu=pt".*|iommu.*)', "iommu", 1, None),
            (r"Dmabuf feature disabled", "Dmabuf", None, "disabled"),
            (r"Disabled GDRCopy", "GDRCopy", None, "disabled"),
            (r"Using network IB", "NET/IB", None, "enabled"),
            (r"NET/Plugin: Could not find: librccl-net.so", "NET/Plugin", None, "internal"),
            (r"XDP is disabled", "XDP", None, "disabled"),
        ]

        records = {}
        for host, rank, content in self.log_entries:
            for pattern, field, group_idx, literal in sys_specs:
                m = re.search(pattern, content, re.IGNORECASE)
                if not m:
                    continue
                value = literal if group_idx is None else m.group(group_idx).strip()
                records.setdefault(field, {}).setdefault((host, rank), set()).add(value)
                break

        self._print_consistency_report("System Information", records)

    def _report_user_envs(self):
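        """Report NCCL_*/RCCL_* environment variables set by the user."""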
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
        records = {}
        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if not m:
                continue
            var_name, var_value = m.group(1), m.group(2).strip()
            records.setdefault(var_name, {}).setdefault((host, rank), set()).add(var_value)

        self._print_consistency_report("User-defined Environment Variables", records)

    def _print_consistency_report(self, title, records):
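        """Print `records` ({field: {(host, rank): set(values)}}), warning when a
        field's value differs across nodes or ranks.
        """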
        print(f"===> {title}:\n")

        if not records:
            print("  (No data found)\n")
            return

        for field in sorted(records):
            entries = records[field]
            values = sorted({value for field_values in entries.values() for value in field_values})
            if len(values) == 1:
                print(f"{field}: {values[0]}")
                continue

            host_values = {}
            host_has_rank_conflict = set()
            for (host, _rank), field_values in entries.items():
                host_values.setdefault(host, set()).update(field_values)
                if len(field_values) > 1:
                    host_has_rank_conflict.add(host)

            for host, field_values in host_values.items():
                if len(field_values) > 1:
                    host_has_rank_conflict.add(host)

            warning_scope = "nodes" if len(host_values) > 1 else "ranks"
            print(f"{field}: (WARNING: Different values across {warning_scope})")
            for host in sorted(host_values):
                if host not in host_has_rank_conflict:
                    joined = " | ".join(sorted(host_values[host]))
                    print(f"  {host}: {joined}")
                    continue

                for (entry_host, rank), field_values in sorted(entries.items()):
                    if entry_host != host:
                        continue
                    joined = " | ".join(sorted(field_values))
                    print(f"  {host} rank {rank}: {joined}")
        print()

    def _report_net_ib_info(self):
        """Parse and print NET/IB GPU Direct RDMA HCA information."""
        print("===> NET/IB Info:\n")

        ib_rows = []
        pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
        for host, rank, content in self.log_entries:
            m = pattern_ib.search(content)
            if m:
                hca_no, hca_id = m.groups()
                ib_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "hca_no": int(hca_no),
                        "hca_id": hca_id,
                        "gdr": 1,
                    }
                )

        if ib_rows:
            df_ib = pd.DataFrame(ib_rows)
            df_ib.drop_duplicates(inplace=True)
            df_ib.sort_values(by=["host", "rank", "hca_no", "hca_id"], inplace=True)
            df_ib = df_ib[["host", "rank", "hca_no", "hca_id", "gdr"]]
            print(df_ib.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

    def _report_gdr_rw_info(self):
        """Parse and print GPU Direct RDMA read/write information."""
        print("===> GDR R/W Info:\n")

        gpu_rows = []
        pattern_gpu = re.compile(
            r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
            r"HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
        )
        for host, rank, content in self.log_entries:
            m = pattern_gpu.search(content)
            if m:
                gpu, hca_no, distance, read_flag = m.groups()
                rw = "read" if read_flag == "1" else "write"
                distance_expr = distance.strip()
                # Split expressions like "4 <= 7" into distance and max_distance
                m_dist = re.match(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$", distance_expr)
                if m_dist:
                    distance_val, max_distance = m_dist.groups()
                else:
                    distance_val, max_distance = distance_expr, "-"
                gpu_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "gpu": gpu,
                        "hca_no": int(hca_no),
                        "distance": distance_val,
                        "max_distance": max_distance,
                        "r/w": rw,
                    }
                )

        if gpu_rows:
            df_gpu = pd.DataFrame(gpu_rows)
            df_gpu.drop_duplicates(inplace=True)
            df_gpu.sort_values(
                by=["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
                inplace=True,
            )
            df_gpu = df_gpu[["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"]]
            print(df_gpu.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

    def _extract_and_print(
        self, title, filter_func, fields, mandatory, verbose_cols, sort_cols, move_rank=True
    ):
        """
        Generic function to extract structured data from log lines and print as a table.

        This handles the common workflow for tabular report sections
        (Graph Info, Ring/Tree Transfers, P2P Transfers, ...). It does NOT apply to
        free-form sections such as System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Clean and validate the data
        4. Reorder columns for readability
        5. Sort and print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
                   A 3-item tuple (col_name, value_pattern, literal_value) is also
                   supported: if value_pattern is None, literal_value is assigned to
                   rows whose raw log matches the key pattern; otherwise literal_value
                   is used as the fallback where no value could be extracted.
            mandatory: List of column names that must not be NaN (drop rows missing these)
            verbose_cols: List of column names to drop when not verbose
            sort_cols: List of column names to sort by (in order)
            move_rank: If True, move "host", "rank", and "protocol" (when present) to the front
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
        data = [(h, r, c) for h, r, c in self.log_entries if filter_func(c)]
        if not data:
            print("  (No data found)\n")
            return

        # Create DataFrame and extract all fields using regex with validation
        df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
        for pattern, field_spec in fields.items():
            if len(field_spec) == 2:
                col_name, val_pattern = field_spec
                # Extract field with strict value validation using word boundary
                df[col_name] = df["raw_log"].str.extract(
                    rf"\b{pattern}\s+({val_pattern})", expand=False
                )
            elif len(field_spec) == 3:
                col_name, val_pattern, literal_value = field_spec
                if val_pattern is None:
                    matched = df["raw_log"].str.contains(rf"\b{pattern}\b", regex=True)
                    df[col_name] = pd.Series(pd.NA, index=df.index, dtype="object")
                    df.loc[matched, col_name] = literal_value
                else:
                    df[col_name] = (
                        df["raw_log"]
                        .str.extract(rf"\b{pattern}\s+({val_pattern})", expand=False)
                        .fillna(literal_value)
                    )
            else:
                raise ValueError(
                    f"Invalid field spec for pattern {pattern!r}: expected 2 or 3 items"
                )

        # Drop verbose columns if not verbose
        if not self._verbose:
            df = df.drop(columns=verbose_cols, errors="ignore")

        # Convert numeric fields to appropriate types
        numeric_columns = [
            "Pattern",
            "nbytes",
            "nchannels",
            "local",
            "send",
            "recv",
            "p2pnChannelsPerPeer",
            "p2pnChannels",
            "nChannelsMax",
            "crossNic",
            "nChannels",
            "sameChannels",
            "slicesteps",
            "nloops",
            "nsteps",
            "chunksize",
            "connIndex",
            "collXhclNum",
            "p2pXhclNum",
        ]
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Clean data - drop invalid rows and duplicates
        # Only keep columns that actually exist in the DataFrame
        mandatory = [c for c in mandatory if c in df.columns]
        df.dropna(subset=mandatory, inplace=True)  # Remove rows missing mandatory fields
        df.drop(columns=["raw_log"], inplace=True)  # No longer need raw log
        df.drop_duplicates(inplace=True)  # Deduplicate identical records

        if df.empty:
            print("  (No valid data found)\n")
            return

        # Reorder columns for better readability
        if move_rank:
            target_order = ["host", "rank", "protocol"]
            leading_cols = [c for c in target_order if c in df.columns]
            remaining_cols = [c for c in df.columns if c not in leading_cols]
            df = df[leading_cols + remaining_cols]

        # Sort the data
        sort_cols = [c for c in sort_cols if c in df.columns]
        if "host" in df.columns and "host" not in sort_cols:
            sort_cols.insert(0, "host")
        if sort_cols:
            df.sort_values(by=sort_cols, inplace=True)

        # Format integer columns to avoid trailing .0
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        # Print the final table with NaN values replaced by "-"
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_graph_info(self):
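        """Report graph search results (Pattern, crossNic, nChannels, bandwidth, ...)."""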
        # Pattern -> column with strict validation
        graph_info_fields = {
            r"Pattern": ("Pattern", r"\d+"),
            r"crossNic": ("crossNic", r"\d+"),
            r"nChannels": ("nChannels", r"\d+"),
            r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
            r"type": ("type", r"[\w/]+"),
            r"sameChannels": ("sameChannels", r"\d+"),
        }
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=graph_info_fields,
            mandatory=["Pattern"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "Pattern"],
        )

    def _report_pcie_mem_channel_info(self):
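        """Report PCIe mem channel settings (connIndex, collXhclNum, p2pXhclNum)."""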
        pcie_mem_fields = {
            r"enable pcie mem channel": ("pcie_mem", None, "enabled"),
            r"connIndex": ("connIndex", r"\d+"),
            r"collXhclNum": ("collXhclNum", r"\d+"),
            r"p2pXhclNum": ("p2pXhclNum", r"\d+"),
        }
        self._extract_and_print(
            title="PCIe Mem Channel Info",
            filter_func=lambda c: "enable pcie mem channel" in c.lower(),
            fields=pcie_mem_fields,
            mandatory=["pcie_mem", "connIndex", "collXhclNum", "p2pXhclNum"],
            verbose_cols=[],
            sort_cols=["host", "rank", "connIndex", "collXhclNum", "p2pXhclNum"],
        )

    def _report_collective_transfers(self):
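        """Report unique Ring/Tree collective transfers (protocol, nbytes, channels, ...)."""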
        # Pattern -> column with strict validation
        cl_transfer_fields = {
            r"protocol": ("protocol", r"Simple|LL|LL128"),
            r"nbytes": ("nbytes", r"\d+"),
            r"algorithm": ("algorithm", r"Tree|Ring"),
            r"slicesteps": ("slicesteps", r"\d+"),
            r"nchannels": ("nchannels", r"\d+"),
            r"nloops": ("nloops", r"\d+"),
            r"nsteps": ("nsteps", r"\d+"),
            r"chunksize": ("chunksize", r"\d+"),
        }
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
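        """Report unique point-to-point transfers (local/send/recv ranks, channel counts)."""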
        # Pattern -> column with strict validation
        p2p_fields = {
            r"p2p : rank": ("local", r"\d+"),
            r"send rank": ("send", r"\d+"),
            r"recv rank": ("recv", r"\d+"),
            r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
            r"p2pnChannels": ("p2pnChannels", r"\d+"),
            r"nChannelsMax": ("nChannelsMax", r"\d+"),
            r"protocol": ("protocol", r"Simple|LL|LL128"),
        }
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=p2p_fields,
            mandatory=["local", "send", "recv"],
            verbose_cols=["host", "rank", "local", "send", "recv"],
            sort_cols=["host", "rank", "protocol", "local", "send", "recv"],
        )

    def _report_channel_transport_info(self):
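        """Report per-channel transport paths (e.g. P2P/IPC, NET/IB/GDRDMA); verbose only."""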
        print("===> Channel Transport Info:\n")

        if not self._verbose:
            print("  (Skipped because verbose mode is not enabled)")
            print()
            return

        data = []

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    {
                        "host": host,
                        "rank": rank,
                        "channel": channel,
                        "sender": int(src),
                        "receiver": int(dst),
                        "type": type_ if type_ else "-",
                        "transport": transport,
                    }
                )

        if not data:
            print("  (No data found)\n")
            return

        df = pd.DataFrame(data)
        df.drop_duplicates(inplace=True)
        df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
        print(df.to_string(index=False))
        print()

    def _report_topo_mapping_info(self):
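        """Report topology mapping file status per host/rank (loaded, parsed, skipped)."""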
        print("===> Topology Mapping File Info:\n")

        topo_mapping_patterns = [
            (r"No topo mapping file", "status", None, "no_file"),
            (r"environmental key word is (\S+)", "fingerprint", 1, None),
            (r"Loading topology mapping file (\S+)", "loaded", 1, None),
            (r"(?:parseing|parsing) topology mapping group[:\s]*(.*)", "parsed", 1, None),
            (r"skip topology mapping group:\s*([^,]+)", "skipped", 1, None),
        ]
        records = {}

        for host, rank, content in self.log_entries:
            if not any(
                s in content.lower()
                for s in ("topo mapping", "topology mapping", "environmental key word")
            ):
                continue
            for pattern, field, group_idx, literal in topo_mapping_patterns:
                m = re.search(pattern, content, re.IGNORECASE)
                if not m:
                    continue
                rec = records.setdefault((host, rank), {"host": host, "rank": rank})
                value = (literal if group_idx is None else m.group(group_idx).strip()) or "-"
                if field in {"parsed", "skipped"}:
                    rec.setdefault(field, set()).add(value)
                else:
                    rec[field] = value
                break

        if not records:
            print("  (No data found)\n")
            return

        for rec in records.values():
            for field in ("parsed", "skipped"):
                if field in rec:
                    rec[field] = sorted(rec[field])
            if rec.get("status") == "no_file":
                continue
            if rec.get("parsed"):
                rec["status"] = "parsed"
            elif rec.get("skipped"):
                rec["status"] = "skipped"
            else:
                rec["status"] = "-"

        by_host = {}
        for (host, rank), rec in records.items():
            by_host.setdefault(host, []).append((rank, rec))

        field_order = ["status", "fingerprint", "loaded", "parsed", "skipped"]
        for host in sorted(by_host):
            print(host)
            host_records = sorted(by_host[host], key=lambda item: item[0])
            normalized = [
                {field: rec.get(field, "-") for field in field_order} for _, rec in host_records
            ]
            if len({repr(item) for item in normalized}) == 1:
                self._print_topo_record(normalized[0], indent="  ")
                print()
                continue

            for rank, rec in host_records:
                print(f"  rank {rank}")
                self._print_topo_record(rec, indent="    ")
            print()

    def _print_topo_record(self, record, indent):
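        """Print one topology mapping record at the given indentation."""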
        for field in ("status", "fingerprint", "loaded"):
            if field in record:
                print(f"{indent}{field}: {record[field]}")
        for field in ("parsed", "skipped"):
            if field not in record:
                continue
            print(f"{indent}{field}:")
            for value in record[field]:
                print(f"{indent}  {value}")