# rccl.py -- parse RCCL/NCCL debug logs into a human-readable report
import re
import pandas as pd


class RcclLogParser:
    """Parse RCCL/NCCL debug-log lines and print a human-readable report.

    Feed raw log lines to :meth:`collect`, then call :meth:`report`.
    Entries are stored as ``(rank, content)`` dict keys, which deduplicates
    identical lines per rank while preserving insertion order.
    """

    # Matches "... [rank] NCCL INFO|WARN|ERROR <content>".  Compiled once at
    # class-creation time because _preprocess_line() runs on every input line.
    _LINE_RE = re.compile(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)")

    def __init__(self):
        # (rank, content) -> None; a dict used as an ordered, deduplicating set.
        self.log_entries = {}

        # Pattern -> replacement output string, or None to echo the line as-is.
        self.sys_patterns = {
            r"kernel version": None,
            r"ROCr version": None,
            r"RCCL version": None,
            r"Librccl path": None,
            r"iommu": None,
            r"Dmabuf feature disabled": "Dmabuf: disabled",
            r"Disabled GDRCopy": "GDRCopy: disabled",
            r"Using network IB": "NET/IB: enabled",
            r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal",
        }

        # Pattern -> (column name, value-validation regex) for graph-info lines.
        self.graph_info_fields = {
            r"Pattern": ("Pattern", r"\d+"),
            r"crossNic": ("crossNic", r"\d+"),
            r"nChannels": ("nChannels", r"\d+"),
            r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
            r"type": ("type", r"[\w/]+"),
            r"sameChannels": ("sameChannels", r"\d+"),
        }

        # Pattern -> (column name, value-validation regex) for ring/tree transfers.
        self.cl_transfer_fields = {
            r"protocol": ("protocol", r"Simple|LL|LL128"),
            r"nbytes": ("nbytes", r"\d+"),
            r"algorithm": ("algorithm", r"Tree|Ring"),
            r"slicesteps": ("slicesteps", r"\d+"),
            r"nchannels": ("nchannels", r"\d+"),
            r"nloops": ("nloops", r"\d+"),
            r"nsteps": ("nsteps", r"\d+"),
            r"chunksize": ("chunksize", r"\d+"),
        }

        # Pattern -> (column name, value-validation regex) for P2P transfers.
        self.p2p_fields = {
            r"p2p : rank": ("local", r"\d+"),
            r"send rank": ("send", r"\d+"),
            r"recv rank": ("recv", r"\d+"),
            r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
            r"p2pnChannels": ("p2pnChannels", r"\d+"),
            r"nChannelsMax": ("nChannelsMax", r"\d+"),
            r"protocol": ("protocol", r"Simple|LL|LL128"),
        }

    def collect(self, line):
        """Ingest one raw log line; lines without an NCCL marker are ignored."""
        self._preprocess_line(line)

    def report(self, verbose=False):
        """Print every report section for the collected log lines.

        Args:
            verbose: If True, keep per-channel detail in the channel transport
                section instead of the deduplicated summary.
        """
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        self._report_sys()
        self._report_user_envs()
        self._report_graph_info()
        self._report_channel_transport_info(verbose)
        self._report_cl_transfers()
        self._report_p2p_transfers()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line):
        """Extract and validate NCCL log lines with rank information."""
        # Expected shape: "[rank] NCCL INFO/WARN/ERROR <content>"
        match = self._LINE_RE.search(line)
        if match:
            rank, content = int(match.group(1)), match.group(2)
            self.log_entries[(rank, content)] = None

    def _report_sys(self):
        """Search sys_patterns and print pre-defined strings if matched."""
        print("===> System Information:\n")
        reported = set()
        # Values in log_entries are always None, so iterate the keys directly.
        for _, content in self.log_entries:
            for pattern, out in self.sys_patterns.items():
                if re.search(pattern, content, re.IGNORECASE):
                    # Use the canned message when provided, else echo the line.
                    reported.add(out if out is not None else content)
                    break
        for line in sorted(reported):
            print(line)
        print()

    def _report_user_envs(self):
        """Report NCCL_*/RCCL_* environment variables set by the user."""
        print("===> User-defined Environment Variables:\n")
        env_vars = {}
        pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
        for _, content in self.log_entries:
            m = pattern.search(content)
            if m:
                var_name, var_value = m.group(1), m.group(2)
                env_vars.setdefault(var_name, set()).add(var_value)
        for key, values in sorted(env_vars.items()):
            if len(values) == 1:
                print(f"{key}: {next(iter(values))}")
            else:
                # Same variable seen with different values on different ranks:
                # usually a misconfigured launch, so flag it loudly.
                print(
                    f"{key}: {', '.join(sorted(values))} (WARNING: Different values across ranks)"
                )
        print()

    def _extract_and_print(self, title, filter_func, fields, mandatory, sort_cols, move_rank=True):
        """
        Generic function to extract structured data from log lines and print as a table.

        This function handles the common workflow for tabular report sections like
        (Graph Info, Ring/Tree Transfers, P2P Transfers). Does NOT apply to
        free-form sections like System Information or User-defined Environment Variables.

        Workflow:
        1. Filter relevant log lines
        2. Extract fields using regex patterns with validation
        3. Clean and validate the data
        4. Reorder columns for readability
        5. Sort and print the table

        Args:
            title: Section title to display (e.g., "Graph Info")
            filter_func: Function to filter relevant log lines (content -> bool)
            fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
                   - pattern: Regex pattern to match the field key (e.g., r"protocol")
                   - col_name: Name of the DataFrame column
                   - value_pattern: Regex pattern to validate/extract the field value
            mandatory: List of column names that must not be NaN (drop rows missing these)
            sort_cols: List of column names to sort by (in order)
            move_rank: If True, move "rank" column to front and "protocol" to second if present
        """
        print(f"===> {title}:\n")

        # Filter relevant log lines using the provided filter function
        data = [(r, c) for r, c in self.log_entries if filter_func(c)]
        if not data:
            print("  (No data found)\n")
            return

        # Create DataFrame and extract all fields using regex with validation
        df = pd.DataFrame(data, columns=["rank", "raw_log"])
        for pattern, (col_name, val_pattern) in fields.items():
            # Extract field with strict value validation using word boundary
            df[col_name] = df["raw_log"].str.extract(
                rf"\b{pattern}\s+({val_pattern})", expand=False
            )

        # Convert numeric fields to appropriate types
        numeric_columns = [
            "Pattern",
            "nbytes",
            "nchannels",
            "local",
            "send",
            "recv",
            "p2pnChannelsPerPeer",
            "p2pnChannels",
            "nChannelsMax",
            "crossNic",
            "nChannels",
            "sameChannels",
            "slicesteps",
            "nloops",
            "nsteps",
            "chunksize",
        ]
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Clean data - drop invalid rows and duplicates
        # Only keep columns that actually exist in the DataFrame
        mandatory = [c for c in mandatory if c in df.columns]
        df.dropna(subset=mandatory, inplace=True)  # Remove rows missing mandatory fields
        df.drop(columns=["raw_log"], inplace=True)  # No longer need raw log
        df.drop_duplicates(inplace=True)  # Deduplicate identical records

        if df.empty:
            print("  (No valid data found)\n")
            return

        # Reorder columns for better readability
        if move_rank:
            cols = df.columns.tolist()
            cols.remove("rank")
            # Move protocol to second position if present
            if "protocol" in cols:
                cols.remove("protocol")
                cols.insert(0, "protocol")
            # Always move rank to first position
            cols.insert(0, "rank")
            df = df[cols]

        # Sort the data (ignore requested sort keys that were never extracted)
        sort_cols = [c for c in sort_cols if c in df.columns]
        if sort_cols:
            df.sort_values(by=sort_cols, inplace=True)

        # Format integer columns to avoid trailing .0
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        # Print the final table with NaN values replaced by "-"
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_graph_info(self):
        """Tabulate graph-search result lines (Pattern/crossNic/...)."""
        self._extract_and_print(
            title="Graph Info",
            filter_func=lambda c: "Pattern" in c and "crossNic" in c,
            fields=self.graph_info_fields,
            mandatory=["Pattern"],
            sort_cols=["rank", "Pattern"],
        )

    def _report_channel_transport_info(self, verbose=False):
        """Tabulate per-channel transport lines (who talks to whom, and how)."""
        print("===> Channel Transport Info:\n")
        data = []

        # Match pattern: Channel 00/0 : 2[5d000] -> 1[56000] [send] via NET/IB/6/GDRDMA
        # Group 1: channel (e.g., 00/0)
        # Group 2: src (e.g., 2)
        # Group 3: dst (e.g., 1)
        # Group 4: type (e.g., send or receive, optional)
        # Group 5: transport (e.g., P2P/IPC, NET/IB/6/GDRDMA)
        pattern = re.compile(
            r"Channel\s+(\d+/\d+)\s+:\s+(\d+)\[.*?\]\s+->\s+(\d+)\[.*?\]"
            r"(?: \[(\w+)\])?\s+via\s+([\w/]+)"
        )

        for rank, content in self.log_entries:
            m = pattern.search(content)
            if m:
                channel, src, dst, type_, transport = m.groups()
                data.append(
                    {
                        "rank": rank,
                        "channel": channel,
                        "sender": int(src),
                        "receiver": int(dst),
                        "type": type_ if type_ else "-",
                        "transport": transport,
                    }
                )

        if not data:
            print("  (No data found)\n")
            return

        df = pd.DataFrame(data)
        df.sort_values(by=["rank", "channel", "sender", "receiver"], inplace=True)
        if not verbose:
            # Collapse to unique (rank, type, transport) rows for the summary.
            df = df.drop(columns=["channel", "sender", "receiver"])
        df.drop_duplicates(inplace=True)
        print(df.to_string(index=False))
        print()

    def _report_cl_transfers(self):
        """Tabulate unique ring/tree collective transfer lines."""
        self._extract_and_print(
            title="Unique Ring/Tree Transfers",
            filter_func=lambda c: "protocol" in c and "nbytes" in c,
            fields=self.cl_transfer_fields,
            mandatory=["protocol", "nbytes"],
            sort_cols=["rank", "nbytes", "protocol", "nchannels"],
        )

    def _report_p2p_transfers(self):
        """Tabulate unique point-to-point transfer lines."""
        self._extract_and_print(
            title="Unique P2P Transfers",
            filter_func=lambda c: "p2p :" in c and "send rank" in c,
            fields=self.p2p_fields,
            mandatory=["local", "send", "recv"],
            sort_cols=["rank", "protocol", "local", "send", "recv"],
        )