Commit a95f20e8 authored by one
Browse files

[xcl-lens] Fix filters

parent deed39a3
......@@ -70,7 +70,8 @@ class RcclLogParser:
self._report_sys()
self._report_user_envs()
self._report_gdr_info()
self._report_net_ib_info()
self._report_gdr_rw_info()
self._report_graph_info()
self._report_channel_transport_info()
self._report_collective_transfers()
......@@ -83,8 +84,8 @@ class RcclLogParser:
# Preferred format:
# <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
# where <host> itself does NOT contain ':' (so we always stop at first colon)
match = re.search(
r"^([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
match = re.match(
r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
line,
)
if match:
......@@ -136,11 +137,10 @@ class RcclLogParser:
)
print()
def _report_gdr_info(self):
"""Parse and print GPU Direct RDMA (GDR) related information."""
print("===> GDR Info:\n")
def _report_net_ib_info(self):
"""Parse and print NET/IB GPU Direct RDMA HCA information."""
print("===> NET/IB Info:\n")
# Part 1: NET/IB : GPU Direct RDMA Enabled for HCA <hca_no> '<hca_id>'
ib_rows = []
pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
for (host, rank, content), _ in self.log_entries.items():
......@@ -157,7 +157,6 @@ class RcclLogParser:
}
)
print(" NET/IB : GPU Direct RDMA Enabled for:\n")
if ib_rows:
df_ib = pd.DataFrame(ib_rows)
df_ib.drop_duplicates(inplace=True)
......@@ -168,7 +167,10 @@ class RcclLogParser:
else:
print(" (No data found)\n")
# Part 2: GPU Direct RDMA Enabled for GPU <gpu> / HCA <hca_no> (distance <expr>), read <0|1>
def _report_gdr_rw_info(self):
"""Parse and print GPU Direct RDMA read/write information."""
print("===> GDR R/W Info:\n")
gpu_rows = []
pattern_gpu = re.compile(
r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
......@@ -198,7 +200,6 @@ class RcclLogParser:
}
)
print(" GPU Direct RDMA Enabled for GPU:\n")
if gpu_rows:
df_gpu = pd.DataFrame(gpu_rows)
df_gpu.drop_duplicates(inplace=True)
......@@ -390,7 +391,7 @@ class RcclLogParser:
return
df = pd.DataFrame(data)
df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
df.drop_duplicates(inplace=True)
df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
print(df.to_string(index=False))
print()
......@@ -40,7 +40,13 @@ def run_with_input(
# Case 3: Execute as command
return _execute_command(
summary=summary, verbose=verbose, rank=rank, log_prefix=log_prefix, cmd=command
summary=summary,
verbose=verbose,
hosts=hosts,
ranks=ranks,
rank=rank,
log_prefix=log_prefix,
cmd=command,
)
......@@ -84,14 +90,23 @@ def _process_files(
return 0
def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str, cmd: list[str]):
def _execute_command(
*,
summary: bool,
verbose: bool,
hosts: list[str] | None,
ranks: list[str] | None,
rank: int,
log_prefix: str,
cmd: list[str],
):
env = os.environ.copy()
env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "ALL"
print(f"{log_prefix} [Wrapper] Running command: {' '.join(cmd)}")
parser = RcclLogParser()
parser = RcclLogParser(verbose=verbose, hosts=hosts, ranks=ranks)
process = subprocess.Popen(
cmd,
env=env,
......@@ -110,6 +125,6 @@ def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str
process.wait()
if rank == 0:
parser.report(verbose=verbose)
parser.report()
return process.returncode
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment