rccl.py 15 KB
Newer Older
one's avatar
one committed
1
import re
2
from typing import Final
3

one's avatar
one committed
4
5
6
7
import pandas as pd


class RcclLogParser:
    """Collect RCCL/NCCL log lines and print a multi-section summary report.

    Lines are fed one at a time through :meth:`collect`. Matching lines are
    stored as deduplicated ``(host, rank, content)`` tuples, and
    :meth:`report` prints every report section to stdout.
    """

    # Pattern -> output string, or None to print the matching log content as-is
    sys_patterns: Final = {
        r"kernel version": None,
        r"ROCr version": None,
        r"RCCL version": None,
        r"Librccl path": None,
        r"iommu": None,
        r"Dmabuf feature disabled": "Dmabuf: disabled",
        r"Disabled GDRCopy": "GDRCopy: disabled",
        r"Using network IB": "NET/IB: enabled",
        r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
    }

    # Pattern -> (column, value pattern) with strict validation
    graph_info_fields: Final = {
        r"Pattern": ("Pattern", r"\d+"),
        r"crossNic": ("crossNic", r"\d+"),
        r"nChannels": ("nChannels", r"\d+"),
        r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
        r"type": ("type", r"[\w/]+"),
        r"sameChannels": ("sameChannels", r"\d+"),
    }

    # Pattern -> (column, value pattern) with strict validation
    cl_transfer_fields: Final = {
        # "LL128" must come before "LL": regex alternation takes the first
        # alternative that matches, so "LL|LL128" would truncate "LL128" to "LL".
        r"protocol": ("protocol", r"Simple|LL128|LL"),
        r"nbytes": ("nbytes", r"\d+"),
        r"algorithm": ("algorithm", r"Tree|Ring"),
        r"slicesteps": ("slicesteps", r"\d+"),
        r"nchannels": ("nchannels", r"\d+"),
        r"nloops": ("nloops", r"\d+"),
        r"nsteps": ("nsteps", r"\d+"),
        r"chunksize": ("chunksize", r"\d+"),
    }

    # Pattern -> (column, value pattern) with strict validation
    p2p_fields: Final = {
        r"p2p : rank": ("local", r"\d+"),
        r"send rank": ("send", r"\d+"),
        r"recv rank": ("recv", r"\d+"),
        r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
        r"p2pnChannels": ("p2pnChannels", r"\d+"),
        r"nChannelsMax": ("nChannelsMax", r"\d+"),
        # Same ordering constraint as in cl_transfer_fields.
        r"protocol": ("protocol", r"Simple|LL128|LL"),
    }

    # Columns converted to numbers for sorting and printed as plain integers
    _NUMERIC_COLUMNS: Final = (
        "Pattern",
        "nbytes",
        "nchannels",
        "local",
        "send",
        "recv",
        "p2pnChannelsPerPeer",
        "p2pnChannels",
        "nChannelsMax",
        "crossNic",
        "nChannels",
        "sameChannels",
        "slicesteps",
        "nloops",
        "nsteps",
        "chunksize",
    )

    # Preferred format:
    #   <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
    # where <host> itself does NOT contain ':' (so we always stop at first colon)
    _LINE_RE: Final = re.compile(
        r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)"
    )
    # Backward-compatible fallback for logs without host/pid/tid prefix
    _FALLBACK_LINE_RE: Final = re.compile(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)")

    def __init__(self, verbose=False, hosts=None, ranks=None):
        """Create an empty parser.

        Args:
            verbose: Verbosity flag used by the report sections.
            hosts: Optional collection of host names to keep; ``None`` or empty
                means no host filtering.
            ranks: Optional iterable of ranks to keep (each converted with
                ``int``); ``None`` or empty means no rank filtering.
        """
        # Deduplicated set of (host, rank, content) tuples
        self.log_entries: set[tuple[str, int, str]] = set()

        # Verbosity flag used by report sections
        self._verbose = verbose

        # Filters
        self._hosts = hosts if hosts is not None else []
        self._ranks = [int(r) for r in ranks] if ranks is not None else []

    def collect(self, line):
        """Parse one raw log line and store it if it is a recognised NCCL line."""
        self._preprocess_line(line)

    def report(self):
        """Print all report sections, framed by a header and footer banner."""
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        self._report_sys()
        self._report_user_envs()
        self._report_net_ib_info()
        self._report_gdr_rw_info()
        self._report_graph_info()
        self._report_channel_transport_info()
        self._report_collective_transfers()
        self._report_p2p_transfers()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
        """Extract NCCL log lines with host/rank information.

        A line in the preferred ``<host>:<pid>:<tid> [rank] NCCL ...`` format is
        checked against both the host and rank filters. A line that only
        matches the prefix-less fallback format is stored with host ``"-"`` and
        is checked against the rank filter only. Anything else is ignored.
        """
        match = self._LINE_RE.match(line)
        if match:
            host, rank, content = match.group(1), int(match.group(2)), match.group(3)
            if self._hosts and host not in self._hosts:
                return
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add((host, rank, content))
            return

        match = self._FALLBACK_LINE_RE.search(line)
        if match:
            rank, content = int(match.group(1)), match.group(2)
            if self._ranks and rank not in self._ranks:
                return
            self.log_entries.add(("-", rank, content))

    @staticmethod
    def _print_table(rows, columns):
        """Print ``rows`` (a list of tuples) as a deduplicated, sorted table.

        Rows are sorted by every column, in column order, so the output does
        not depend on the iteration order of the ``log_entries`` set.
        """
        if not rows:
            print("  (No data found)\n")
            return

        columns = list(columns)
        df = pd.DataFrame(rows, columns=columns).drop_duplicates().sort_values(by=columns)
        print(df.to_string(index=False))
        print()

    def _report_sys(self):
        """Search patterns and print pre-defined strings if matched"""
        print("===> System Information:\n")
        reported = set()
        for _, _, content in self.log_entries:
            for pattern, out in self.sys_patterns.items():
                if re.search(pattern, content, re.IGNORECASE):
                    # None means "print the log content itself"
                    reported.add(content if out is None else out)
                    break  # first matching pattern wins for this entry
        for line in sorted(reported):
            print(line)
        print()

    def _report_user_envs(self):
        """Search environment variables set by user"""
        print("===> User-defined Environment Variables:\n")
        env_vars = {}
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
        for _, _, content in self.log_entries:
            m = pattern.search(content)
            if m:
                var_name, var_value = m.group(1), m.group(2)
                env_vars.setdefault(var_name, set()).add(var_value)
        for key, values in sorted(env_vars.items()):
            if len(values) == 1:
                print(f"{key}: {next(iter(values))}")
            else:
                print(
                    f"{key}: {', '.join(sorted(values))} (WARNING: Different values across ranks)"
                )
        print()

    def _report_net_ib_info(self):
        """Parse and print NET/IB GPU Direct RDMA HCA information."""
        print("===> NET/IB Info:\n")

        pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
        ib_rows = []
        for host, rank, content in self.log_entries:
            m = pattern_ib.search(content)
            if m:
                hca_no, hca_id = m.groups()
                # "gdr" is always 1: the pattern only matches "Enabled" lines
                ib_rows.append((host, rank, int(hca_no), hca_id, 1))

        self._print_table(ib_rows, ["host", "rank", "hca_no", "hca_id", "gdr"])

    def _report_gdr_rw_info(self):
        """Parse and print GPU Direct RDMA read/write information."""
        print("===> GDR R/W Info:\n")

        pattern_gpu = re.compile(
            r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
            r"HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
        )
        # Split expressions like "4 <= 7" into distance and max_distance
        pattern_dist = re.compile(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$")

        gpu_rows = []
        for host, rank, content in self.log_entries:
            m = pattern_gpu.search(content)
            if not m:
                continue
            gpu, hca_no, distance, read_flag = m.groups()
            rw = "read" if read_flag == "1" else "write"
            distance_expr = distance.strip()
            m_dist = pattern_dist.match(distance_expr)
            if m_dist:
                distance_val, max_distance = m_dist.groups()
            else:
                # Unrecognised expression: keep it verbatim, no maximum known
                distance_val, max_distance = distance_expr, "-"
            gpu_rows.append((host, rank, gpu, int(hca_no), distance_val, max_distance, rw))

        self._print_table(
            gpu_rows,
            ["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
        )

    def _extract_and_print(
        self, title, filter_func, fields, mandatory, verbose_cols, sort_cols, move_rank=True
    ):
        """
        Generic function to extract structured data from log lines and print as a table.

        This function handles the common workflow for tabular report sections like
        (Graph Info, Ring/Tree Transfers, P2P Transfers). Does NOT apply to
        free-form sections like System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Drop rows missing mandatory fields
        4. Hide verbose-only columns (when not verbose) and deduplicate
        5. Reorder columns, sort and print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
                     (must not contain capturing groups of its own)
            mandatory: List of column names that must not be NaN (drop rows missing these)
            verbose_cols: List of column names shown only in verbose mode
                (they are dropped from the table when not verbose)
            sort_cols: List of column names to sort by (in order); columns not
                present in the final table are ignored
            move_rank: If True, move "host", "rank" and "protocol" (those that
                are present) to the front, in that order
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
        data = [entry for entry in self.log_entries if filter_func(entry[2])]
        if not data:
            print("  (No data found)\n")
            return

        # Create DataFrame and extract all fields using regex with validation
        df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
        for pattern, (col_name, val_pattern) in fields.items():
            # Extract field with strict value validation using word boundary
            df[col_name] = df["raw_log"].str.extract(
                rf"\b{pattern}\s+({val_pattern})", expand=False
            )

        # Convert numeric fields to appropriate types
        for col in self._NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Validate mandatory fields BEFORE hiding verbose-only columns. A
        # mandatory column may also be verbose-only; if it were dropped first
        # the check would be skipped and invalid rows would be printed.
        required = [c for c in mandatory if c in df.columns]
        if required:
            df = df.dropna(subset=required)

        hidden = ["raw_log"]  # raw log is never printed
        if not self._verbose:
            hidden.extend(verbose_cols)
        df = df.drop(columns=hidden, errors="ignore")
        df = df.drop_duplicates()  # Deduplicate identical records

        if df.empty:
            print("  (No valid data found)\n")
            return

        # Reorder columns for better readability
        if move_rank:
            leading_cols = [c for c in ("host", "rank", "protocol") if c in df.columns]
            remaining_cols = [c for c in df.columns if c not in leading_cols]
            df = df.reindex(columns=leading_cols + remaining_cols)

        # Sort the data. The requested keys come first; every remaining column
        # is appended as a tie-breaker so the row order does not depend on the
        # iteration order of the log_entries set.
        order = [c for c in sort_cols if c in df.columns]
        if "host" in df.columns and "host" not in order:
            order.insert(0, "host")
        order.extend(c for c in df.columns if c not in order)
        df = df.sort_values(by=order)

        # Format integer columns to avoid trailing .0
        for col in self._NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        # Print the final table with NaN values replaced by "-"
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_graph_info(self):
        """Print the "Graph Info" table built from lines mentioning Pattern and crossNic."""
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=self.graph_info_fields,
            mandatory=["Pattern"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "Pattern"],
        )

    def _report_collective_transfers(self):
        """Print the "Unique Ring/Tree Transfers" table (lines with protocol and nbytes)."""
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=self.cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
        """Print the "Unique P2P Transfers" table (lines with "p2p :" and "send rank")."""
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=self.p2p_fields,
            mandatory=["local", "send", "recv"],
            verbose_cols=["host", "rank", "local", "send", "recv"],
            sort_cols=["host", "rank", "protocol", "local", "send", "recv"],
        )

    def _report_channel_transport_info(self):
        """Print per-channel sender/receiver/transport rows (verbose mode only)."""
        print("===> Channel Transport Info:\n")

        if not self._verbose:
            print("  (Skipped because verbose mode is not enabled)")
            print()
            return

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

        data = []
        for host, rank, content in self.log_entries:
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    (host, rank, channel, int(src), int(dst), type_ if type_ else "-", transport)
                )

        # "type" and "transport" act as tie-breakers after the primary keys.
        self._print_table(
            data,
            ["host", "rank", "channel", "sender", "receiver", "type", "transport"],
        )