# rccl.py
import re
import pandas as pd


class RcclLogParser:
    """Collect NCCL-style log lines and print a de-duplicated summary report.

    Feed raw log lines one at a time through :meth:`collect`, then call
    :meth:`report` to print the summary to stdout. Only lines containing
    ``[<digits>] NCCL INFO|WARN|ERROR <message>`` contribute to the report.
    """

    # Captures the message that follows "[<digits>] NCCL <LEVEL> ".
    _LOG_MESSAGE_RE = re.compile(r"\[\d+\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)")
    # Captures "<NAME> set by environment to <value>".
    _USER_ENV_RE = re.compile(r"(\w+)\s+set by environment to\s+(.+)")

    def __init__(self):
        """Create an empty parser with the default pattern tables."""
        # Unique log messages (prefix stripped) extracted by report().
        self.output = set()
        # Unique raw lines handed to collect().
        self.raw_lines = set()

        # Pattern -> output string or as-is (None prints the matched line)
        self.sys_patterns = {
            r"kernel version": None,
            r"ROCr version": None,
            r"RCCL version": None,
            r"Librccl path": None,
            r"iommu": None,
            r"Dmabuf feature disabled": "Dmabuf: disabled",
            r"Disabled GDRCopy": "GDRCopy: disabled",
        }

        # Pattern -> column
        self.graph_info_fields = {
            r"Pattern": "Pattern",
            r"crossNic": "crossNic",
            r"nChannels": "nChannels",
            r"bw": "bandwidth",
            r"type": "type",
            r"sameChannels": "sameChannels",
        }

        # Pattern -> column
        self.cl_transfer_fields = {
            r"protocol": "protocol",
            r"nbytes": "nbytes",
            r"algorithm": "algorithm",
            r"slicesteps": "slicesteps",
            r"nchannels": "nchannels",
            r"nloops": "nloops",
            r"nsteps": "nsteps",
            r"chunksize": "chunksize",
        }

        # Pattern -> column
        self.p2p_fields = {
            r"p2p : rank": "local",
            r"send rank": "send",
            r"recv rank": "recv",
            r"p2pnChannelsPerPeer": "p2pnChannelsPerPeer",
            r"p2pnChannels": "p2pnChannels",
            r"nChannelsMax": "nChannelsMax",
            r"protocol": "protocol",
        }

    def collect(self, line: str) -> None:
        """Remember one raw log line; identical lines are stored once."""
        self.raw_lines.add(line)

    def report(self) -> None:
        """Print the full report for every collected line to stdout."""
        print(" RCCL Log Parser Report ".center(80, "="))
        print()

        for line in self.raw_lines:
            self._preprocess_line(line)

        self._report_sys()
        self._report_user_envs()
        self._report_graph_info()
        self._report_cl_transfers()
        self._report_p2p_transfers()

        print(" End of Report ".center(80, "="))

    def _preprocess_line(self, line: str) -> None:
        """Strip the log prefix from *line* and store the remaining message.

        Lines that do not contain ``[<digits>] NCCL INFO|WARN|ERROR`` are
        ignored.
        """
        match = self._LOG_MESSAGE_RE.search(line)
        if match:
            # Trailing "\r" or spaces would make otherwise identical messages
            # distinct and would leak into values captured by later regexes.
            self.output.add(match.group(1).rstrip())

    def _sorted_messages(self):
        """Return the extracted messages in a reproducible (sorted) order.

        ``self.output`` is a set, whose iteration order for strings can differ
        from one interpreter run to the next.
        """
        return sorted(self.output)

    @staticmethod
    def _extract_fields(lines, fields, prefix=r"\b", value=r"\S+"):
        """Build a DataFrame with one column per entry of *fields*.

        For each ``pattern -> column`` pair the regex
        ``<prefix><pattern>\\s+(<value>)`` is searched in every line; the
        captured text becomes the cell, or NaN when the line has no match.
        """
        raw = pd.Series(list(lines), dtype=object)
        columns = {
            column: raw.str.extract(rf"{prefix}{pattern}\s+({value})", expand=False)
            for pattern, column in fields.items()
        }
        return pd.DataFrame(columns, index=raw.index)

    @staticmethod
    def _print_table(df) -> None:
        """Print *df* without its index, showing missing cells as "-"."""
        print(df.fillna("-").to_string(index=False))
        print()

    def _report_sys(self) -> None:
        """Search patterns and print pre-defined strings if matched"""
        print("===> System Information:\n")
        # A dict is used as an insertion-ordered set so that a replacement
        # string shared by several messages is printed only once.
        reported = {}
        for line in self._sorted_messages():
            for pattern, replacement in self.sys_patterns.items():
                if re.search(pattern, line, re.IGNORECASE):
                    reported[replacement if replacement else line] = None
                    break
        for line in reported:
            print(line)
        print()

    def _report_user_envs(self) -> None:
        """Search environment variables set by user"""
        print("===> User-defined Environment Variables:\n")
        for line in self._sorted_messages():
            m = self._USER_ENV_RE.search(line)
            if m:
                print(f"{m.group(1)}: {m.group(2)}")
        print()

    def _report_graph_info(self) -> None:
        """Extract graph information"""
        print("===> Graph Info:\n")

        # Filter lines by looking for 'Pattern' and 'crossNic'
        lines = [
            line
            for line in self._sorted_messages()
            if "Pattern" in line and "crossNic" in line
        ]
        if not lines:
            print("  (No graph info found)\n")
            return

        # Extract each field independently (order-agnostic)
        # Values are comma-separated, so use [^,\s]+ to exclude trailing commas
        df = self._extract_fields(lines, self.graph_info_fields, value=r"[^,\s]+")
        df = df.drop_duplicates()

        if "Pattern" in df.columns:
            # Type conversion for correct (numeric, not lexical) sorting
            df["Pattern"] = pd.to_numeric(df["Pattern"], errors="coerce")
            df = df.sort_values(by="Pattern", ascending=False, kind="stable")

        self._print_table(df)

    def _report_cl_transfers(self) -> None:
        """Extract non-P2P transfer arguments"""
        print("===> Unique Ring/Tree Transfers:\n")

        # Filter lines by looking for 'protocol' and 'nbytes'
        lines = [
            line
            for line in self._sorted_messages()
            if "protocol" in line and "nbytes" in line
        ]
        if not lines:
            print("  (No transfer patterns found)\n")
            return

        df = self._extract_fields(lines, self.cl_transfer_fields)

        # Type conversion for correct sorting
        for field in ("nbytes", "nchannels"):
            if field in df.columns:
                df[field] = pd.to_numeric(df[field], errors="coerce")

        # Drop rows where mandatory fields are missing
        mandatory_cols = [c for c in ("protocol", "nbytes") if c in df.columns]
        if mandatory_cols:
            df = df.dropna(subset=mandatory_cols)
        df = df.drop_duplicates()

        # Every candidate line may have been rejected above; report that
        # explicitly instead of printing an empty-frame dump.
        if df.empty:
            print("  (No transfer patterns found)\n")
            return

        sort_cols = [c for c in ("nbytes", "protocol", "nchannels") if c in df.columns]
        if sort_cols:
            df = df.sort_values(by=sort_cols, kind="stable")

        self._print_table(df)

    def _report_p2p_transfers(self) -> None:
        """Extract P2P transfer details"""
        print("===> Unique P2P Transfers:\n")

        # Filter lines by looking for 'p2p :' and 'send rank'
        lines = [
            line
            for line in self._sorted_messages()
            if "p2p :" in line and "send rank" in line
        ]
        if not lines:
            print("  (No P2P transfers found)\n")
            return

        # These patterns are searched without a leading word boundary.
        df = self._extract_fields(lines, self.p2p_fields, prefix="")

        # Type conversion for correct sorting
        numeric_cols = (
            "local",
            "send",
            "recv",
            "p2pnChannelsPerPeer",
            "p2pnChannels",
            "nChannelsMax",
        )
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        df = df.drop_duplicates()

        sort_cols = [c for c in ("protocol", "local", "send", "recv") if c in df.columns]
        if sort_cols:
            df = df.sort_values(by=sort_cols, kind="stable")

        # Move 'protocol' to the first column
        if "protocol" in df.columns:
            df = df[["protocol"] + [c for c in df.columns if c != "protocol"]]

        self._print_table(df)