import re

import pandas as pd


class RcclLogParser:
    """Collect RCCL/NCCL log lines and print a sectioned summary report.

    Feed raw log lines through :meth:`collect`, then call :meth:`report` to
    print every report section to stdout.  Entries are kept in a set, so
    identical ``(host, rank, content)`` lines are stored only once.

    Args:
        verbose: When true, per-host/per-rank columns are kept in the tabular
            sections and the channel transport section is printed.
        hosts: Optional collection of host names to keep.  ``None`` or empty
            means "no host filtering".
        ranks: Optional collection of ranks to keep; each item is converted
            with ``int()``.  ``None`` or empty means "no rank filtering".
    """

    # Preferred line format:
    #   <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
    # <host> never contains ':' so matching always stops at the first colon.
    _LINE_WITH_HOST = re.compile(
        r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)"
    )
    # Backward-compatible format without the host/pid/tid prefix.
    _LINE_RANK_ONLY = re.compile(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)")

    # Regex alternation is ordered: the longest alternative must come first,
    # otherwise "LL128" would be captured as "LL".
    _PROTOCOL_VALUES = r"LL128|LL|Simple"

    # Columns converted to numbers (for numeric sorting) in _extract_and_print.
    _NUMERIC_COLUMNS = (
        "Pattern",
        "nbytes",
        "nchannels",
        "local",
        "send",
        "recv",
        "p2pnChannelsPerPeer",
        "p2pnChannels",
        "nChannelsMax",
        "crossNic",
        "nChannels",
        "sameChannels",
        "slicesteps",
        "nloops",
        "nsteps",
        "chunksize",
    )

    def __init__(self, verbose=False, hosts=None, ranks=None):
        # Deduplicated set of (host, rank, content) tuples
        self.log_entries: set[tuple[str, int, str]] = set()

        # Verbosity flag used by report sections
        self._verbose = verbose

        # Filters (empty means "accept everything")
        self._hosts = hosts if hosts is not None else []
        self._ranks = [int(r) for r in ranks] if ranks is not None else []

    def collect(self, line):
        """Parse one raw log line and store it if it is an NCCL log entry."""
        self._preprocess_line(line)

    def report(self):
        """Print all report sections to stdout, framed by a header and footer."""
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        self._report_sys()
        self._report_user_envs()
        self._report_topo_mapping_info()
        self._report_net_ib_info()
        self._report_gdr_rw_info()
        self._report_graph_info()
        self._report_channel_transport_info()
        self._report_collective_transfers()
        self._report_p2p_transfers()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
        """Extract an NCCL log entry (host, rank, content) from *line*.

        Lines in the preferred ``<host>:<pid>:<tid> [rank] NCCL ...`` format are
        subject to both the host and the rank filter.  Lines that only carry
        ``[rank] NCCL ...`` are stored with host ``"-"`` and are subject to the
        rank filter only.  Anything else is ignored.
        """
        match = self._LINE_WITH_HOST.match(line)
        if match:
            host, rank, content = match.group(1), int(match.group(2)), match.group(3)
            if self._hosts and host not in self._hosts:
                return
        else:
            match = self._LINE_RANK_ONLY.search(line)
            if not match:
                return
            host, rank, content = "-", int(match.group(1)), match.group(2)

        if self._ranks and rank not in self._ranks:
            return

        # "." also matches "\r", so CRLF logs would otherwise leave a trailing
        # carriage return that defeats de-duplication and pollutes the output.
        self.log_entries.add((host, rank, content.rstrip()))

    @staticmethod
    def _print_table(rows, sort_cols):
        """Print *rows* (a list of dicts) as a de-duplicated, sorted table.

        Column order follows the key order of the row dicts.  Prints a
        "(No data found)" placeholder when *rows* is empty.
        """
        if not rows:
            print("  (No data found)\n")
            return
        df = pd.DataFrame(rows).drop_duplicates().sort_values(by=sort_cols)
        print(df.to_string(index=False))
        print()

    def _report_sys(self):
        """Print system facts: matching log lines as-is or a fixed summary string."""
        # Pattern -> output string, or None to print the matching line as-is
        sys_patterns = {
            r"kernel version": None,
            r"ROCr version": None,
            r"RCCL version": None,
            r"Librccl path": None,
            r"iommu": None,
            r"Dmabuf feature disabled": "Dmabuf: disabled",
            r"Disabled GDRCopy": "GDRCopy: disabled",
            r"Using network IB": "NET/IB: enabled",
            r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
            r"XDP is disabled": "XDP: disabled",
        }
        compiled = [(re.compile(p, re.IGNORECASE), out) for p, out in sys_patterns.items()]

        print("===> System Information:\n")
        reported = set()
        for _, _, content in self.log_entries:
            for regex, out in compiled:
                if regex.search(content):
                    reported.add(out if out is not None else content)
                    break  # first matching pattern wins for this line
        for line in sorted(reported):
            print(line)
        print()

    def _report_user_envs(self):
        """Print NCCL_*/RCCL_* variables reported as set, flagging conflicting values."""
        print("===> User-defined Environment Variables:\n")
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
        env_vars = {}
        for _, _, content in self.log_entries:
            m = pattern.search(content)
            if m:
                var_name, var_value = m.group(1), m.group(2)
                env_vars.setdefault(var_name, set()).add(var_value)
        for key, values in sorted(env_vars.items()):
            if len(values) == 1:
                print(f"{key}: {next(iter(values))}")
            else:
                print(
                    f"{key}: {', '.join(sorted(values))} (WARNING: Different values across ranks)"
                )
        print()

    def _report_net_ib_info(self):
        """Parse and print NET/IB GPU Direct RDMA HCA information."""
        print("===> NET/IB Info:\n")

        pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
        ib_rows = []
        for host, rank, content in self.log_entries:
            m = pattern_ib.search(content)
            if m:
                hca_no, hca_id = m.groups()
                ib_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "hca_no": int(hca_no),
                        "hca_id": hca_id,
                        "gdr": 1,
                    }
                )

        self._print_table(ib_rows, ["host", "rank", "hca_no", "hca_id"])

    def _report_gdr_rw_info(self):
        """Parse and print GPU Direct RDMA read/write information."""
        print("===> GDR R/W Info:\n")

        pattern_gpu = re.compile(
            r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
            r"HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
        )
        # Splits expressions like "4 <= 7" into distance and max_distance
        pattern_dist = re.compile(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$")

        gpu_rows = []
        for host, rank, content in self.log_entries:
            m = pattern_gpu.search(content)
            if not m:
                continue
            gpu, hca_no, distance, read_flag = m.groups()
            distance_expr = distance.strip()
            m_dist = pattern_dist.match(distance_expr)
            if m_dist:
                distance_val, max_distance = m_dist.groups()
            else:
                # Unrecognised expression: show it verbatim, no upper bound
                distance_val, max_distance = distance_expr, "-"
            gpu_rows.append(
                {
                    "host": host,
                    "rank": rank,
                    "gpu": gpu,
                    "hca_no": int(hca_no),
                    "distance": distance_val,
                    "max_distance": max_distance,
                    "r/w": "read" if read_flag == "1" else "write",
                }
            )

        self._print_table(
            gpu_rows,
            ["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
        )

    def _extract_and_print(
        self, title, filter_func, fields, mandatory, verbose_cols, sort_cols, move_rank=True
    ):
        """
        Generic function to extract structured data from log lines and print as a table.

        This function handles the common workflow for tabular report sections
        (Graph Info, Ring/Tree Transfers, P2P Transfers). It does NOT apply to
        free-form sections like System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Drop rows missing mandatory fields, then drop verbose-only columns
        4. Deduplicate, sort and reorder columns for readability
        5. Print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
            mandatory: List of column names that must not be NaN (rows missing these
                are dropped, in verbose and non-verbose mode alike)
            verbose_cols: List of column names shown only in verbose mode; they are
                dropped when not verbose so that otherwise-identical rows collapse
            sort_cols: List of column names to sort by (in order); any remaining
                columns are used as tie-breakers so the output is deterministic
            move_rank: If True, move "host", "rank" and "protocol" (when present)
                to the front, in that order
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
        data = [entry for entry in self.log_entries if filter_func(entry[2])]
        if not data:
            print("  (No data found)\n")
            return

        # Create DataFrame and extract all fields using regex with validation
        df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
        for pattern, (col_name, val_pattern) in fields.items():
            # Word boundary keeps e.g. "nChannels" from matching inside "p2pnChannels"
            df[col_name] = df["raw_log"].str.extract(
                rf"\b{pattern}\s+({val_pattern})", expand=False
            )

        # Validate mandatory fields BEFORE dropping verbose-only columns: a
        # mandatory column may itself be verbose-only, and checking afterwards
        # would silently skip the validation in non-verbose mode.
        required = [c for c in mandatory if c in df.columns]
        if required:
            df = df.dropna(subset=required)
        df = df.drop(columns=["raw_log"])  # No longer need raw log

        # Drop verbose columns if not verbose
        if not self._verbose:
            df = df.drop(columns=verbose_cols, errors="ignore")

        # Convert numeric fields so they sort numerically rather than as text
        for col in self._NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        df = df.drop_duplicates()  # Deduplicate identical records
        if df.empty:
            print("  (No valid data found)\n")
            return

        # Sort the data; log_entries is a set, so every column takes part in the
        # ordering to keep the printed table stable between runs.
        sort_keys = [c for c in sort_cols if c in df.columns]
        if "host" in df.columns and "host" not in sort_keys:
            sort_keys.insert(0, "host")
        sort_keys += [c for c in df.columns if c not in sort_keys]
        df = df.sort_values(by=sort_keys)

        # Format integer columns to avoid trailing .0
        for col in self._NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        # Reorder columns for better readability
        if move_rank:
            leading_cols = [c for c in ("host", "rank", "protocol") if c in df.columns]
            remaining_cols = [c for c in df.columns if c not in leading_cols]
            df = df.reindex(columns=leading_cols + remaining_cols)

        # Print the final table with NaN values replaced by "-"
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_graph_info(self):
        """Print the graph search results (Pattern/crossNic/nChannels/bw/type)."""
        # Pattern -> column with strict validation
        graph_info_fields = {
            r"Pattern": ("Pattern", r"\d+"),
            r"crossNic": ("crossNic", r"\d+"),
            r"nChannels": ("nChannels", r"\d+"),
            r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
            r"type": ("type", r"[\w/]+"),
            r"sameChannels": ("sameChannels", r"\d+"),
        }
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=graph_info_fields,
            mandatory=["Pattern"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "Pattern"],
        )

    def _report_collective_transfers(self):
        """Print the unique Ring/Tree collective transfer configurations."""
        # Pattern -> column with strict validation
        cl_transfer_fields = {
            r"protocol": ("protocol", self._PROTOCOL_VALUES),
            r"nbytes": ("nbytes", r"\d+"),
            r"algorithm": ("algorithm", r"Tree|Ring"),
            r"slicesteps": ("slicesteps", r"\d+"),
            r"nchannels": ("nchannels", r"\d+"),
            r"nloops": ("nloops", r"\d+"),
            r"nsteps": ("nsteps", r"\d+"),
            r"chunksize": ("chunksize", r"\d+"),
        }
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
        """Print the unique point-to-point transfer configurations."""
        # Pattern -> column with strict validation
        p2p_fields = {
            r"p2p : rank": ("local", r"\d+"),
            r"send rank": ("send", r"\d+"),
            r"recv rank": ("recv", r"\d+"),
            r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
            r"p2pnChannels": ("p2pnChannels", r"\d+"),
            r"nChannelsMax": ("nChannelsMax", r"\d+"),
            r"protocol": ("protocol", self._PROTOCOL_VALUES),
        }
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=p2p_fields,
            mandatory=["local", "send", "recv"],
            verbose_cols=["host", "rank", "local", "send", "recv"],
            sort_cols=["host", "rank", "protocol", "local", "send", "recv"],
        )

    def _report_channel_transport_info(self):
        """Print the per-channel sender/receiver/transport table (verbose mode only)."""
        print("===> Channel Transport Info:\n")

        if not self._verbose:
            print("  (Skipped because verbose mode is not enabled)")
            print()
            return

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

        data = []
        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    {
                        "host": host,
                        "rank": rank,
                        "channel": channel,
                        "sender": int(src),
                        "receiver": int(dst),
                        "type": type_ if type_ else "-",
                        "transport": transport,
                    }
                )

        # "type" and "transport" are tie-breakers only; they keep the row order
        # stable even though log_entries is an unordered set.
        self._print_table(
            data,
            ["host", "rank", "channel", "sender", "receiver", "type", "transport"],
        )

    def _report_aggregated_info(
        self,
        title: str,
        filter_func,
        patterns: list,
        accumulate_fields: set,
        col_order: list,
    ):
        """
        Generic method for sections where multiple log lines contribute different
        fields to a single per-(host, rank) record.

        Unlike _extract_and_print (one line -> all fields), this aggregates
        across lines: each line may fill one field of the record.

        Args:
            title:             Section title.
            filter_func:       Pre-filter for log content (content -> bool).
            patterns:          List of (search_pattern, field, group_idx, literal);
                               patterns are matched case-insensitively and the first
                               one that matches a line wins.
            accumulate_fields: Fields that collect values across multiple lines (stored as set).
            col_order:         Preferred column display order.
        """
        print(f"===> {title}:\n")

        compiled = [
            (re.compile(pattern, re.IGNORECASE), field, group_idx, literal)
            for pattern, field, group_idx, literal in patterns
        ]

        records: dict[tuple, dict] = {}

        # Iterate in sorted order: non-accumulating fields are "last write wins",
        # and log_entries is a set, so sorting keeps the winner deterministic.
        relevant = sorted(entry for entry in self.log_entries if filter_func(entry[2]))
        for host, rank, content in relevant:
            for regex, field, group_idx, literal in compiled:
                m = regex.search(content)
                if not m:
                    continue
                rec = records.setdefault((host, rank), {"host": host, "rank": rank})
                if group_idx is None:
                    value = literal
                else:
                    value = (m.group(group_idx) or "").strip()
                value = value or "-"
                if field in accumulate_fields:
                    rec.setdefault(field, set()).add(value)
                else:
                    rec[field] = value
                break  # each line matches at most one pattern

        if not records:
            print("  (No data found)\n")
            return

        # Flatten accumulated sets -> sorted string
        for rec in records.values():
            for field in accumulate_fields:
                if field in rec:
                    rec[field] = " | ".join(sorted(rec[field]))

        df = pd.DataFrame(list(records.values())).sort_values(by=["host", "rank"])

        ordered = [c for c in col_order if c in df.columns]
        remaining = [c for c in df.columns if c not in ordered]
        df = df.reindex(columns=ordered + remaining)

        print(df.fillna("-").to_string(index=False))
        print()

    def _report_topo_mapping_info(self):
        """Print, per (host, rank), what happened with the topology mapping file."""
        # (search_pattern, field_name, capture_group_index_or_None, literal_value_or_None)
        # - capture_group_index: int -> use that regex group, None -> use literal_value
        # The "parseing" alternative deliberately matches that spelling in the log text.
        topo_mapping_patterns = [
            (r"No topo mapping file", "status", None, "no_file"),
            (r"environmental key word is (\S+)", "fingerprint", 1, None),
            (r"Loading topology mapping file (\S+)", "loaded", 1, None),
            (r"(?:parseing|parsing) topology mapping group[:\s]*(.*)", "parsed", 1, None),
            (r"skip topology mapping group:\s*([^,]+)", "skipped", 1, None),
        ]
        # Fields that should accumulate across multiple matching lines (per host/rank)
        topo_mapping_accumulate_fields = {"skipped", "parsed"}

        self._report_aggregated_info(
            title="Topology Mapping File Info",
            filter_func=lambda c: any(
                s in c.lower()
                for s in ("topo mapping", "topology mapping", "environmental key word")
            ),
            patterns=topo_mapping_patterns,
            accumulate_fields=topo_mapping_accumulate_fields,
            col_order=["host", "rank", "status", "fingerprint", "loaded", "parsed", "skipped"],
        )