Commit 02d08801 authored by one's avatar one
Browse files

[xcl-lens] Refactor RcclLogParser to use structured logging with validation

parent bf904971
......@@ -5,7 +5,8 @@ import pandas as pd
class RcclLogParser:
def __init__(self):
self.output = set()
# (rank, content) -> None
self.log_entries = dict()
self.raw_lines = set()
# Pattern -> output string or as-is
......@@ -19,37 +20,37 @@ class RcclLogParser:
r"Disabled GDRCopy": "GDRCopy: disabled",
}
# Pattern -> column
# Pattern -> column with strict validation
self.graph_info_fields = {
r"Pattern": "Pattern",
r"crossNic": "crossNic",
r"nChannels": "nChannels",
r"bw": "bandwidth",
r"type": "type",
r"sameChannels": "sameChannels",
r"Pattern": ("Pattern", r"\d+"),
r"crossNic": ("crossNic", r"\d+"),
r"nChannels": ("nChannels", r"\d+"),
r"bw": ("bandwidth", r"[\d.]+/[\d.]+"),
r"type": ("type", r"[\w/]+"),
r"sameChannels": ("sameChannels", r"\d+"),
}
# Pattern -> column
# Pattern -> column with strict validation
self.cl_transfer_fields = {
r"protocol": "protocol",
r"nbytes": "nbytes",
r"algorithm": "algorithm",
r"slicesteps": "slicesteps",
r"nchannels": "nchannels",
r"nloops": "nloops",
r"nsteps": "nsteps",
r"chunksize": "chunksize",
r"protocol": ("protocol", r"Simple|LL|LL128"),
r"nbytes": ("nbytes", r"\d+"),
r"algorithm": ("algorithm", r"Tree|Ring"),
r"slicesteps": ("slicesteps", r"\d+"),
r"nchannels": ("nchannels", r"\d+"),
r"nloops": ("nloops", r"\d+"),
r"nsteps": ("nsteps", r"\d+"),
r"chunksize": ("chunksize", r"\d+"),
}
# Pattern -> column
# Pattern -> column with strict validation
self.p2p_fields = {
r"p2p : rank": "local",
r"send rank": "send",
r"recv rank": "recv",
r"p2pnChannelsPerPeer": "p2pnChannelsPerPeer",
r"p2pnChannels": "p2pnChannels",
r"nChannelsMax": "nChannelsMax",
r"protocol": "protocol",
r"p2p : rank": ("local", r"\d+"),
r"send rank": ("send", r"\d+"),
r"recv rank": ("recv", r"\d+"),
r"p2pnChannelsPerPeer": ("p2pnChannelsPerPeer", r"\d+"),
r"p2pnChannels": ("p2pnChannels", r"\d+"),
r"nChannelsMax": ("nChannelsMax", r"\d+"),
r"protocol": ("protocol", r"Simple|LL|LL128"),
}
def collect(self, line):
......@@ -71,121 +72,88 @@ class RcclLogParser:
print(" End of Report ".center(80, "="))
def _preprocess_line(self, line):
match = re.search(r"\[\d+\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
"""Extract and validate NCCL log lines with rank information"""
# Match lines that have a valid NCCL log format with rank
# Pattern: [rank] NCCL INFO/WARN/ERROR followed by content
match = re.search(r"\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", line)
if match:
self.output.add(match.group(1))
rank, content = int(match.group(1)), match.group(2)
if len(content) >= 20:
self.log_entries[(rank, content)] = None
def _report_sys(self):
"""Search patterns and print pre-defined strings if matched"""
print("===> System Information:\n")
reported_lines = []
for line in self.output:
for pattern, output in self.sys_patterns.items():
if re.search(pattern, line, re.IGNORECASE):
reported_lines.append(output if output else line)
reported = set()
for (_, content), _ in self.log_entries.items():
for pattern, out in self.sys_patterns.items():
if re.search(pattern, content, re.IGNORECASE):
reported.add(out if out else content)
break
for line in reported_lines:
for line in sorted(reported):
print(line)
print()
def _report_user_envs(self):
"""Search environment variables set by user"""
print("===> User-defined Environment Variables:\n")
env_vars = {}
pattern = re.compile(r"(\w+)\s+set by environment to\s+(.+)")
for line in self.output:
m = pattern.search(line)
for (_, content), _ in self.log_entries.items():
m = pattern.search(content)
if m:
print(f"{m.group(1)}: {m.group(2)}")
env_vars[m.group(1)] = m.group(2)
for key, value in sorted(env_vars.items()):
print(f"{key}: {value}")
print()
def _report_graph_info(self):
"""Extract graph information"""
print("===> Graph Info:\n")
# Filter lines by looking for 'Pattern' and 'crossNic'
filtered_lines = [line for line in self.output if "Pattern" in line and "crossNic" in line]
if not filtered_lines:
print(" (No graph info found)\n")
return
df = pd.DataFrame(filtered_lines, columns=["raw_log"])
# Extract each field independently (order-agnostic)
# Values are comma-separated, so use [^,\s]+ to exclude trailing commas
for pattern, col_name in self.graph_info_fields.items():
df[col_name] = df["raw_log"].str.extract(rf"\b{pattern}\s+([^,\s]+)", expand=False)
# Type conversion for correct sorting
if "Pattern" in df.columns:
df["Pattern"] = pd.to_numeric(df["Pattern"], errors="coerce")
# Clean up
df.drop(columns=["raw_log"], inplace=True)
df.drop_duplicates(inplace=True)
df.sort_values(by="Pattern", ascending=False, inplace=True)
print(df.fillna("-").to_string(index=False))
print()
def _report_cl_transfers(self):
"""Extract non-P2P transfer arguments"""
print("===> Unique Ring/Tree Transfers:\n")
# Filter lines by looking for 'protocol' and 'nbytes'
raw_lines = [line for line in self.output if "protocol" in line and "nbytes" in line]
if not raw_lines:
print(" (No transfer patterns found)\n")
def _extract_and_print(self, title, filter_func, fields, mandatory, sort_cols, move_rank=True):
"""
Generic function to extract structured data from log lines and print as a table.
This function handles the common workflow for tabular report sections like
(Graph Info, Ring/Tree Transfers, P2P Transfers). Does NOT apply to
free-form sections like System Information or User-defined Environment Variables.
Workflow:
1. Filter relevant log lines
2. Extract fields using regex patterns with validation
3. Clean and validate the data
4. Reorder columns for readability
5. Sort and print the table
Args:
title: Section title to display (e.g., "Graph Info")
filter_func: Function to filter relevant log lines (content -> bool)
fields: Dict of {pattern: (col_name, value_pattern)} for field extraction
- pattern: Regex pattern to match the field key (e.g., r"protocol")
- col_name: Name of the DataFrame column
- value_pattern: Regex pattern to validate/extract the field value
mandatory: List of column names that must not be NaN (drop rows missing these)
sort_cols: List of column names to sort by (in order)
move_rank: If True, move "rank" column to front and "protocol" to second if present
"""
print(f"===> {title}:\n")
# Filter relevant log lines using the provided filter function
data = [(r, c) for (r, c), _ in self.log_entries.items() if filter_func(c)]
if not data:
print(" (No data found)\n")
return
df = pd.DataFrame(raw_lines, columns=["raw_log"])
# Extract all fields using a single loop
for pattern, col_name in self.cl_transfer_fields.items():
df[col_name] = df["raw_log"].str.extract(rf"\b{pattern}\s+(\S+)", expand=False)
# Type conversion for correct sorting
for field in ["nbytes", "nchannels"]:
if field in df.columns:
df[field] = pd.to_numeric(df[field], errors="coerce")
# Drop rows where mandatory fields are missing
mandatory_cols = [c for c in ["protocol", "nbytes"] if c in df.columns]
df.dropna(subset=mandatory_cols, inplace=True)
# Clean up
df.drop(columns=["raw_log"], inplace=True)
df.drop_duplicates(inplace=True)
sort_cols = ["nbytes", "protocol", "nchannels"]
sort_cols = [col for col in sort_cols if col in df.columns]
if sort_cols:
df.sort_values(by=sort_cols, inplace=True)
# Fill NaNs with "-" and print
print(df.fillna("-").to_string(index=False))
print()
def _report_p2p_transfers(self):
"""Extract P2P transfer details"""
print("===> Unique P2P Transfers:\n")
# Filter lines by looking for 'p2p :' and 'send rank'
raw_lines = [line for line in self.output if "p2p :" in line and "send rank" in line]
if not raw_lines:
print(" (No P2P transfers found)\n")
return
# Extract all fields using a single loop
df = pd.DataFrame(raw_lines, columns=["raw_log"])
for pattern, col_name in self.p2p_fields.items():
df[col_name] = df["raw_log"].str.extract(rf"{pattern}\s+(\S+)", expand=False)
# Type conversion for correct sorting
numeric_cols = [
# Create DataFrame and extract all fields using regex with validation
df = pd.DataFrame(data, columns=["rank", "raw_log"])
for pattern, (col_name, val_pattern) in fields.items():
# Extract field with strict value validation using word boundary
df[col_name] = df["raw_log"].str.extract(
rf"\b{pattern}\s+({val_pattern})", expand=False
)
# Convert numeric fields to appropriate types
numeric_columns = [
"Pattern",
"nbytes",
"nchannels",
"local",
"send",
"recv",
......@@ -193,26 +161,65 @@ class RcclLogParser:
"p2pnChannels",
"nChannelsMax",
]
for col in numeric_cols:
for col in numeric_columns:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
# Clean up
df.drop(columns=["raw_log"], inplace=True)
df.drop_duplicates(inplace=True)
# Clean data - drop invalid rows and duplicates
# Only keep columns that actually exist in the DataFrame
mandatory = [c for c in mandatory if c in df.columns]
df.dropna(subset=mandatory, inplace=True) # Remove rows missing mandatory fields
df.drop(columns=["raw_log"], inplace=True) # No longer need raw log
df.drop_duplicates(inplace=True) # Deduplicate identical records
sort_cols = ["protocol", "local", "send", "recv"]
sort_cols = [c for c in sort_cols if c in df.columns]
if sort_cols:
df.sort_values(by=sort_cols, inplace=True)
if df.empty:
print(" (No valid data found)\n")
return
# Move 'protocol' to the first column
# Reorder columns for better readability
if move_rank:
cols = df.columns.tolist()
cols.remove("rank")
# Move protocol to second position if present
if "protocol" in cols:
cols.remove("protocol")
cols.insert(0, "protocol")
# Always move rank to first position
cols.insert(0, "rank")
df = df[cols]
# Fill NaNs with "-" and print
# SSort the data
sort_cols = [c for c in sort_cols if c in df.columns]
if sort_cols:
df.sort_values(by=sort_cols, inplace=True)
# Print the final table with NaN values replaced by "-"
print(df.fillna("-").to_string(index=False))
print()
def _report_graph_info(self):
self._extract_and_print(
title="Graph Info",
filter_func=lambda c: "Pattern" in c and "crossNic" in c,
fields=self.graph_info_fields,
mandatory=["Pattern"],
sort_cols=["rank", "Pattern"],
)
def _report_cl_transfers(self):
self._extract_and_print(
title="Unique Ring/Tree Transfers",
filter_func=lambda c: "protocol" in c and "nbytes" in c,
fields=self.cl_transfer_fields,
mandatory=["protocol", "nbytes"],
sort_cols=["rank", "nbytes", "protocol", "nchannels"],
)
def _report_p2p_transfers(self):
self._extract_and_print(
title="Unique P2P Transfers",
filter_func=lambda c: "p2p :" in c and "send rank" in c,
fields=self.p2p_fields,
mandatory=["local", "send", "recv"],
sort_cols=["rank", "protocol", "local", "send", "recv"],
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment