rccl.py 15.2 KB
Newer Older
one's avatar
one committed
1
import re
2

one's avatar
one committed
3
4
5
6
7
import pandas as pd


class RcclLogParser:
    def __init__(self):
one's avatar
one committed
8
        # (host, rank, content) -> None
9
        self.log_entries = dict()
one's avatar
one committed
10

one's avatar
one committed
11
12
13
        # Verbosity flag used by report sections
        self._verbose = False

one's avatar
one committed
14
15
16
17
18
19
20
21
22
        # Pattern -> output string or as-is
        self.sys_patterns = {
            r"kernel version": None,
            r"ROCr version": None,
            r"RCCL version": None,
            r"Librccl path": None,
            r"iommu": None,
            r"Dmabuf feature disabled": "Dmabuf: disabled",
            r"Disabled GDRCopy": "GDRCopy: disabled",
one's avatar
one committed
23
24
            r"Using network IB": "NET/IB: enabled",
            r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
one's avatar
one committed
25
26
        }

27
        # Pattern -> column with strict validation
one's avatar
one committed
28
        self.graph_info_fields = {
29
30
31
32
33
34
            r"Pattern": ("Pattern", r"\d+"),
            r"crossNic": ("crossNic", r"\d+"),
            r"nChannels": ("nChannels", r"\d+"),
            r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
            r"type": ("type", r"[\w/]+"),
            r"sameChannels": ("sameChannels", r"\d+"),
one's avatar
one committed
35
36
        }

37
        # Pattern -> column with strict validation
38
        self.cl_transfer_fields = {
39
40
41
42
43
44
45
46
            r"protocol": ("protocol", r"Simple|LL|LL128"),
            r"nbytes": ("nbytes", r"\d+"),
            r"algorithm": ("algorithm", r"Tree|Ring"),
            r"slicesteps": ("slicesteps", r"\d+"),
            r"nchannels": ("nchannels", r"\d+"),
            r"nloops": ("nloops", r"\d+"),
            r"nsteps": ("nsteps", r"\d+"),
            r"chunksize": ("chunksize", r"\d+"),
one's avatar
one committed
47
48
        }

49
        # Pattern -> column with strict validation
50
        self.p2p_fields = {
51
52
53
54
55
56
57
            r"p2p : rank": ("local", r"\d+"),
            r"send rank": ("send", r"\d+"),
            r"recv rank": ("recv", r"\d+"),
            r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
            r"p2pnChannels": ("p2pnChannels", r"\d+"),
            r"nChannelsMax": ("nChannelsMax", r"\d+"),
            r"protocol": ("protocol", r"Simple|LL|LL128"),
58
59
        }

one's avatar
one committed
60
    def collect(self, line):
61
        self._preprocess_line(line)
one's avatar
one committed
62

one's avatar
one committed
63
    def report(self, verbose=False):
one's avatar
one committed
64
65
66
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

one's avatar
one committed
67
68
69
        # Remember verbosity for sub-sections
        self._verbose = verbose

one's avatar
one committed
70
71
        self._report_sys()
        self._report_user_envs()
one's avatar
one committed
72
        self._report_gdr_info()
one's avatar
one committed
73
        self._report_graph_info()
one's avatar
one committed
74
75
        self._report_channel_transport_info()
        self._report_collective_transfers()
76
        self._report_p2p_transfers()
one's avatar
one committed
77
78
79
80

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
one's avatar
one committed
81
82
83
84
85
86
87
88
89
90
91
92
93
94
        """Extract NCCL log lines with host/rank information."""
        # Preferred format:
        #   <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
        # where <host> itself does NOT contain ':' (so we always stop at first colon)
        match = re.search(
            r"^([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
            line,
        )
        if match:
            host, rank, content = match.group(1), int(match.group(2)), match.group(3)
            self.log_entries[(host, rank, content)] = None
            return

        # Backward-compatible fallback for logs without host/pid/tid prefix
95
        match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
one's avatar
one committed
96
        if match:
97
            rank, content = int(match.group(1)), match.group(2)
one's avatar
one committed
98
            self.log_entries[("-", rank, content)] = None
one's avatar
one committed
99
100
101
102

    def _report_sys(self):
        """Search patterns and print pre-defined strings if matched"""
        print("===> System Information:\n")
103
        reported = set()
one's avatar
one committed
104
        for (_, _, content), _ in self.log_entries.items():
105
106
            for pattern, out in self.sys_patterns.items():
                if re.search(pattern, content, re.IGNORECASE):
107
                    reported.add(out if out is not None else content)
one's avatar
one committed
108
                    break
109
        for line in sorted(reported):
one's avatar
one committed
110
111
112
113
114
115
            print(line)
        print()

    def _report_user_envs(self):
        """Search environment variables set by user"""
        print("===> User-defined Environment Variables:\n")
116
        env_vars = {}
one's avatar
one committed
117
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
one's avatar
one committed
118
        for (_, _, content), _ in self.log_entries.items():
119
            m = pattern.search(content)
one's avatar
one committed
120
            if m:
121
122
123
124
125
126
127
128
129
                var_name, var_value = m.group(1), m.group(2)
                env_vars.setdefault(var_name, set()).add(var_value)
        for key, values in sorted(env_vars.items()):
            if len(values) == 1:
                print(f"{key}: {next(iter(values))}")
            else:
                print(
                    f"{key}: {', '.join(sorted(values))} (WARNING: Different values across ranks)"
                )
one's avatar
one committed
130
131
        print()

one's avatar
one committed
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
    def _report_gdr_info(self):
        """Parse and print GPU Direct RDMA (GDR) related information."""
        print("===> GDR Info:\n")

        # Part 1: NET/IB : GPU Direct RDMA Enabled for HCA <hca_no> '<hca_id>'
        ib_rows = []
        pattern_ib = re.compile(
            r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'"
        )
        for (host, rank, content), _ in self.log_entries.items():
            m = pattern_ib.search(content)
            if m:
                hca_no, hca_id = m.groups()
                ib_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "hca_no": int(hca_no),
                        "hca_id": hca_id,
                        "gdr": 1,
                    }
                )

        print("  NET/IB : GPU Direct RDMA Enabled for:\n")
        if ib_rows:
            df_ib = pd.DataFrame(ib_rows)
            df_ib.drop_duplicates(inplace=True)
            df_ib.sort_values(by=["host", "rank", "hca_no", "hca_id"], inplace=True)
            df_ib = df_ib[["host", "rank", "hca_no", "hca_id", "gdr"]]
            if not self._verbose:
one's avatar
one committed
162
                df_ib = df_ib.drop(columns=["host", "rank"])
one's avatar
one committed
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
                df_ib.drop_duplicates(inplace=True)
            print(df_ib.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

        # Part 2: GPU Direct RDMA Enabled for GPU <gpu> / HCA <hca_no> (distance <expr>), read <0|1>
        gpu_rows = []
        pattern_gpu = re.compile(
            r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*HCA\s+(\d+)\s*\(distance\s+([^)]*)\),\s*read\s+([01])"
        )
        for (host, rank, content), _ in self.log_entries.items():
            m = pattern_gpu.search(content)
            if m:
                gpu, hca_no, distance, read_flag = m.groups()
                rw = "read" if read_flag == "1" else "write"
                distance_expr = distance.strip()
                # Split expressions like "4 <= 7" into distance and max_distance
                m_dist = re.match(r"^([+-]?\d+)\s*<=\s*([+-]?\d+)$", distance_expr)
                if m_dist:
                    distance_val, max_distance = m_dist.groups()
                else:
                    distance_val, max_distance = distance_expr, "-"
                gpu_rows.append(
                    {
                        "host": host,
                        "rank": rank,
                        "gpu": gpu,
                        "hca_no": int(hca_no),
                        "distance": distance_val,
                        "max_distance": max_distance,
                        "r/w": rw,
                    }
                )

        print("  GPU Direct RDMA Enabled for GPU:\n")
        if gpu_rows:
            df_gpu = pd.DataFrame(gpu_rows)
            df_gpu.drop_duplicates(inplace=True)
            df_gpu.sort_values(
                by=["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"],
                inplace=True,
            )
            df_gpu = df_gpu[["host", "rank", "gpu", "hca_no", "distance", "max_distance", "r/w"]]
            if not self._verbose:
one's avatar
one committed
208
                df_gpu = df_gpu.drop(columns=["host", "rank"])
one's avatar
one committed
209
210
211
212
213
214
                df_gpu.drop_duplicates(inplace=True)
            print(df_gpu.to_string(index=False))
            print()
        else:
            print("  (No data found)\n")

one's avatar
one committed
215
    def _extract_and_print(self, title, filter_func, fields, mandatory, verbose_cols, sort_cols, move_rank=True):
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
        """
        Generic function to extract structured data from log lines and print as a table.

        This function handles the common workflow for tabular report sections like
        (Graph Info, Ring/Tree Transfers, P2P Transfers). Does NOT apply to
        free-form sections like System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Clean and validate the data
        4. Reorder columns for readability
        5. Sort and print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
            mandatory: List of column names that must not be NaN (drop rows missing these)
one's avatar
one committed
238
            verbose_cols: List of column names to keep when not verbose
239
240
241
242
243
244
            sort_cols: List of column names to sort by (in order)
            move_rank: If True, move "rank" column to front and "protocol" to second if present
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
one's avatar
one committed
245
        data = [(h, r, c) for (h, r, c), _ in self.log_entries.items() if filter_func(c)]
246
247
        if not data:
            print("  (No data found)\n")
one's avatar
one committed
248
249
            return

250
        # Create DataFrame and extract all fields using regex with validation
one's avatar
one committed
251
        df = pd.DataFrame(data, columns=["host", "rank", "raw_log"])
252
253
254
255
256
257
        for pattern, (col_name, val_pattern) in fields.items():
            # Extract field with strict value validation using word boundary
            df[col_name] = df["raw_log"].str.extract(
                rf"\b{pattern}\s+({val_pattern})", expand=False
            )

one's avatar
one committed
258
259
260
261
        # Drop verbose columns if not verbose
        if not self._verbose:
            df = df.drop(columns=verbose_cols, errors="ignore")

262
263
264
265
266
        # Convert numeric fields to appropriate types
        numeric_columns = [
            "Pattern",
            "nbytes",
            "nchannels",
267
268
269
270
271
272
            "local",
            "send",
            "recv",
            "p2pnChannelsPerPeer",
            "p2pnChannels",
            "nChannelsMax",
273
274
275
276
277
278
279
            "crossNic",
            "nChannels",
            "sameChannels",
            "slicesteps",
            "nloops",
            "nsteps",
            "chunksize",
280
        ]
281
        for col in numeric_columns:
282
283
284
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

285
286
287
288
289
290
291
292
293
294
295
296
297
        # Clean data - drop invalid rows and duplicates
        # Only keep columns that actually exist in the DataFrame
        mandatory = [c for c in mandatory if c in df.columns]
        df.dropna(subset=mandatory, inplace=True)  # Remove rows missing mandatory fields
        df.drop(columns=["raw_log"], inplace=True)  # No longer need raw log
        df.drop_duplicates(inplace=True)  # Deduplicate identical records

        if df.empty:
            print("  (No valid data found)\n")
            return

        # Reorder columns for better readability
        if move_rank:
one's avatar
one committed
298
299
300
301
            target_order = ["host", "rank", "protocol"]
            leading_cols = [c for c in target_order if c in df.columns]
            remaining_cols = [c for c in df.columns if c not in leading_cols]
            df = df[leading_cols + remaining_cols]
302

303
        # Sort the data
304
        sort_cols = [c for c in sort_cols if c in df.columns]
one's avatar
one committed
305
306
        if "host" in df.columns and "host" not in sort_cols:
            sort_cols.insert(0, "host")
one's avatar
one committed
307
308
309
        if sort_cols:
            df.sort_values(by=sort_cols, inplace=True)

310
311
312
313
314
        # Format integer columns to avoid trailing .0
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

315
        # Print the final table with NaN values replaced by "-"
one's avatar
one committed
316
317
        print(df.fillna("-").to_string(index=False))
        print()
318
319
320
321
322
323
324

    def _report_graph_info(self):
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=self.graph_info_fields,
            mandatory=["Pattern"],
one's avatar
one committed
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "Pattern"],
        )

    def _report_collective_transfers(self):
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=self.cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            verbose_cols=["host", "rank"],
            sort_cols=["host", "rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=self.p2p_fields,
            mandatory=["local", "send", "recv"],
            verbose_cols=["host", "rank", "local", "send", "recv"],
            sort_cols=["host", "rank", "protocol", "local", "send", "recv"],
347
348
        )

one's avatar
one committed
349
    def _report_channel_transport_info(self):
one's avatar
one committed
350
        print("===> Channel Transport Info:\n")
one's avatar
one committed
351
352
353
354
355
356

        if not self._verbose:
            print("  (Skipped because verbose mode is not enabled)")
            print()
            return

one's avatar
one committed
357
358
359
360
361
362
363
364
365
366
367
368
369
        data = []

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

one's avatar
one committed
370
        for (host, rank, content), _ in self.log_entries.items():
one's avatar
one committed
371
372
373
374
375
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    {
one's avatar
one committed
376
                        "host": host,
one's avatar
one committed
377
378
379
380
381
382
383
384
385
386
387
388
389
390
                        "rank": rank,
                        "channel": channel,
                        "sender": int(src),
                        "receiver": int(dst),
                        "type": type_ if type_ else "-",
                        "transport": transport,
                    }
                )

        if not data:
            print("  (No data found)\n")
            return

        df = pd.DataFrame(data)
one's avatar
one committed
391
        df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
one's avatar
one committed
392
393
394
        df.drop_duplicates(inplace=True)
        print(df.to_string(index=False))
        print()