Commit a95f20e8 authored by one
Browse files

[xcl-lens] Fix filters

parent deed39a3
...@@ -70,7 +70,8 @@ class RcclLogParser: ...@@ -70,7 +70,8 @@ class RcclLogParser:
self._report_sys() self._report_sys()
self._report_user_envs() self._report_user_envs()
self._report_gdr_info() self._report_net_ib_info()
self._report_gdr_rw_info()
self._report_graph_info() self._report_graph_info()
self._report_channel_transport_info() self._report_channel_transport_info()
self._report_collective_transfers() self._report_collective_transfers()
...@@ -83,8 +84,8 @@ class RcclLogParser: ...@@ -83,8 +84,8 @@ class RcclLogParser:
# Preferred format: # Preferred format:
# <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content> # <host>:<pid>:<tid> [rank] NCCL INFO/WARN/ERROR <content>
# where <host> itself does NOT contain ':' (so we always stop at first colon) # where <host> itself does NOT contain ':' (so we always stop at first colon)
match = re.search( match = re.match(
r"^([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)", r"([^:\s]+):\d+:\d+\s+\[(\d+)\]\s+NCCL\s+(?:INFO|WARN|ERROR)\s+(.*)",
line, line,
) )
if match: if match:
...@@ -136,11 +137,10 @@ class RcclLogParser: ...@@ -136,11 +137,10 @@ class RcclLogParser:
) )
print() print()
def _report_gdr_info(self): def _report_net_ib_info(self):
"""Parse and print GPU Direct RDMA (GDR) related information.""" """Parse and print NET/IB GPU Direct RDMA HCA information."""
print("===> GDR Info:\n") print("===> NET/IB Info:\n")
# Part 1: NET/IB : GPU Direct RDMA Enabled for HCA <hca_no> '<hca_id>'
ib_rows = [] ib_rows = []
pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'") pattern_ib = re.compile(r"NET/IB\s+:\s+GPU Direct RDMA Enabled for HCA\s+(\d+)\s+'([^']+)'")
for (host, rank, content), _ in self.log_entries.items(): for (host, rank, content), _ in self.log_entries.items():
...@@ -157,7 +157,6 @@ class RcclLogParser: ...@@ -157,7 +157,6 @@ class RcclLogParser:
} }
) )
print(" NET/IB : GPU Direct RDMA Enabled for:\n")
if ib_rows: if ib_rows:
df_ib = pd.DataFrame(ib_rows) df_ib = pd.DataFrame(ib_rows)
df_ib.drop_duplicates(inplace=True) df_ib.drop_duplicates(inplace=True)
...@@ -168,7 +167,10 @@ class RcclLogParser: ...@@ -168,7 +167,10 @@ class RcclLogParser:
else: else:
print(" (No data found)\n") print(" (No data found)\n")
# Part 2: GPU Direct RDMA Enabled for GPU <gpu> / HCA <hca_no> (distance <expr>), read <0|1> def _report_gdr_rw_info(self):
"""Parse and print GPU Direct RDMA read/write information."""
print("===> GDR R/W Info:\n")
gpu_rows = [] gpu_rows = []
pattern_gpu = re.compile( pattern_gpu = re.compile(
r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*" r"GPU Direct RDMA Enabled for GPU\s+(\S+)\s*/\s*"
...@@ -198,7 +200,6 @@ class RcclLogParser: ...@@ -198,7 +200,6 @@ class RcclLogParser:
} }
) )
print(" GPU Direct RDMA Enabled for GPU:\n")
if gpu_rows: if gpu_rows:
df_gpu = pd.DataFrame(gpu_rows) df_gpu = pd.DataFrame(gpu_rows)
df_gpu.drop_duplicates(inplace=True) df_gpu.drop_duplicates(inplace=True)
...@@ -390,7 +391,7 @@ class RcclLogParser: ...@@ -390,7 +391,7 @@ class RcclLogParser:
return return
df = pd.DataFrame(data) df = pd.DataFrame(data)
df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
df.drop_duplicates(inplace=True) df.drop_duplicates(inplace=True)
df.sort_values(by=["host", "rank", "channel", "sender", "receiver"], inplace=True)
print(df.to_string(index=False)) print(df.to_string(index=False))
print() print()
...@@ -40,7 +40,13 @@ def run_with_input( ...@@ -40,7 +40,13 @@ def run_with_input(
# Case 3: Execute as command # Case 3: Execute as command
return _execute_command( return _execute_command(
summary=summary, verbose=verbose, rank=rank, log_prefix=log_prefix, cmd=command summary=summary,
verbose=verbose,
hosts=hosts,
ranks=ranks,
rank=rank,
log_prefix=log_prefix,
cmd=command,
) )
...@@ -84,14 +90,23 @@ def _process_files( ...@@ -84,14 +90,23 @@ def _process_files(
return 0 return 0
def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str, cmd: list[str]): def _execute_command(
*,
summary: bool,
verbose: bool,
hosts: list[str] | None,
ranks: list[str] | None,
rank: int,
log_prefix: str,
cmd: list[str],
):
env = os.environ.copy() env = os.environ.copy()
env["NCCL_DEBUG"] = "INFO" env["NCCL_DEBUG"] = "INFO"
env["NCCL_DEBUG_SUBSYS"] = "ALL" env["NCCL_DEBUG_SUBSYS"] = "ALL"
print(f"{log_prefix} [Wrapper] Running command: {' '.join(cmd)}") print(f"{log_prefix} [Wrapper] Running command: {' '.join(cmd)}")
parser = RcclLogParser() parser = RcclLogParser(verbose=verbose, hosts=hosts, ranks=ranks)
process = subprocess.Popen( process = subprocess.Popen(
cmd, cmd,
env=env, env=env,
...@@ -110,6 +125,6 @@ def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str ...@@ -110,6 +125,6 @@ def _execute_command(*, summary: bool, verbose: bool, rank: int, log_prefix: str
process.wait() process.wait()
if rank == 0: if rank == 0:
parser.report(verbose=verbose) parser.report()
return process.returncode return process.returncode
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment