Commit 3b294cb7 authored by one
Browse files

[xcl-lens] Improve sys and env reports

parent d5bae7a2
...@@ -62,49 +62,61 @@ class RcclLogParser: ...@@ -62,49 +62,61 @@ class RcclLogParser:
self.log_entries.add(("-", rank, content)) self.log_entries.add(("-", rank, content))
def _report_sys(self): def _report_sys(self):
"""Search patterns and print pre-defined strings if matched""" sys_specs = [
# Pattern -> output string or as-is (r"kernel version\s*:?\s*(.+)", "kernel version", 1, None),
sys_patterns = { (r"ROCr version\s*:?\s*(.+)", "ROCr version", 1, None),
r"kernel version": None, (r"RCCL version\s*:?\s*(.+)", "RCCL version", 1, None),
r"ROCr version": None, (r"Librccl path\s*:?\s*(.+)", "Librccl path", 1, None),
r"RCCL version": None, (r'(Missing "iommu=pt".*|iommu.*)', "iommu", 1, None),
r"Librccl path": None, (r"Dmabuf feature disabled", "Dmabuf", None, "disabled"),
r"iommu": None, (r"Disabled GDRCopy", "GDRCopy", None, "disabled"),
r"Dmabuf feature disabled": "Dmabuf: disabled", (r"Using network IB", "NET/IB", None, "enabled"),
r"Disabled GDRCopy": "GDRCopy: disabled", (r"NET/Plugin: Could not find: librccl-net.so", "NET/Plugin", None, "internal"),
r"Using network IB": "NET/IB: enabled", (r"XDP is disabled", "XDP", None, "disabled"),
r"NET/Plugin: Could not find: librccl-net.so": "NET/Plugin: internal", ]
r"XDP is disabled": "XDP: disabled",
}
print("===> System Information:\n") records = {}
reported = set() for host, rank, content in self.log_entries:
for _, _, content in self.log_entries: for pattern, field, group_idx, literal in sys_specs:
for pattern, out in sys_patterns.items(): m = re.search(pattern, content, re.IGNORECASE)
if re.search(pattern, content, re.IGNORECASE): if not m:
reported.add(out if out is not None else content) continue
break value = literal if group_idx is None else m.group(group_idx).strip()
for line in sorted(reported): records.setdefault(field, {}).setdefault((host, rank), set()).add(value)
print(line) break
print()
self._print_consistency_report("System Information", records)
def _report_user_envs(self): def _report_user_envs(self):
"""Search environment variables set by user"""
print("===> User-defined Environment Variables:\n")
env_vars = {}
pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)") pattern = re.compile(r"((?:N|R)CCL_\w+)\s+set(?: by environment)? to\s+(.+)")
for _, _, content in self.log_entries: records = {}
for host, rank, content in self.log_entries:
m = pattern.search(content) m = pattern.search(content)
if m: if not m:
var_name, var_value = m.group(1), m.group(2) continue
env_vars.setdefault(var_name, set()).add(var_value) var_name, var_value = m.group(1), m.group(2).strip()
for key, values in sorted(env_vars.items()): records.setdefault(var_name, {}).setdefault((host, rank), set()).add(var_value)
self._print_consistency_report("User-defined Environment Variables", records)
def _print_consistency_report(self, title, records):
print(f"===> {title}:\n")
if not records:
print(" (No data found)\n")
return
for field in sorted(records):
entries = records[field]
values = sorted({value for field_values in entries.values() for value in field_values})
if len(values) == 1: if len(values) == 1:
print(f"{key}: {next(iter(values))}") print(f"{field}: {values[0]}")
else: continue
print(
f"{key}: {', '.join(sorted(values))} (WARNING: Different values across ranks)" print(f"{field}: (WARNING: Different values across ranks)")
) for (host, rank), field_values in sorted(entries.items()):
joined = " | ".join(sorted(field_values))
print(f" {host} rank {rank}: {joined}")
print() print()
def _report_net_ib_info(self): def _report_net_ib_info(self):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment