# Copyright (c) Microsoft Corporation.
# Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Licensed under the MIT License.

# example run
# python3 ./[rccl]/tools/scripts/npkit_trace_generator.py --npkit_dump_dir=[npkit_dump_dir] --npkit_event_header_path=[rccl]/src/include/npkit/npkit_event.h --output_dir=/home/akollias/dev/

"""Convert NPKit GPU/CPU event dumps into a trace and print timing statistics.

The script reads the binary ``gpu_events_rank_*_buf_*`` and
``cpu_events_rank_*_channel_*`` files found in a dump directory, pairs
``*_ENTRY`` events with the following non-entry event, accumulates per-event
statistics in the module-level tables below, prints a report, and optionally
writes ``npkit_event_trace.json``.
"""

import argparse
import os
import json

from queue import Queue


def _zero_table(*dims):
    """Return a nested list of zeros; every row is a distinct list object."""
    if len(dims) == 1:
        return [0] * dims[0]
    return [_zero_table(*dims[1:]) for _ in range(dims[0])]


# Upper bounds of the statistics tables; GPU tables are indexed [rank][buf][id].
MAX_RANK = 32
MAX_BUF = 32
MAX_ID = 256
MAX_GPU_BUF_EVENT_NUM = 1024 * 1024
MAX_CPU_BUF_EVENT_NUM = 2 * 1024 * 1024

# Layout of the operation sequence assumed in "normal test" mode. check_op()
# only lets statistics through while the per-buffer operation counter lies in
# [OUT_START1, OUT_END1] or [IN_START1, IN_END1].
WARM_NUM = 10
RUN_NUM = 20
PRE_NUM = 1
CHECK_NUM = 1
OUT_START1 = WARM_NUM + PRE_NUM + 1  # 12
OUT_END1 = OUT_START1 + RUN_NUM - 1  # 31
IN_START1 = WARM_NUM + PRE_NUM + RUN_NUM + CHECK_NUM + PRE_NUM + 1  # 34
IN_END1 = IN_START1 + RUN_NUM - 1  # 53
ALL_NUM = WARM_NUM + PRE_NUM + RUN_NUM + CHECK_NUM + PRE_NUM + RUN_NUM + CHECK_NUM  # 54

# Event ids whose closing event advances the per-buffer operation counter.
OP_ALLREDUCE_RING = 0x2
OP_ALLREDUCE_TREE = 0x4
OP_ALLGATHER = 0x51
OP_SENDL = 0x4A
OP_SEND = 0x4C
OP_RECV = 0x4E
OP_BCAST = 0x6B
op_group = [OP_ALLREDUCE_RING, OP_ALLREDUCE_TREE, OP_ALLGATHER, OP_SENDL, OP_SEND, OP_RECV, OP_BCAST]

# GPU statistics, indexed [rank][buf][id].
gpu_io_count = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_tm = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_tm = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_dt = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_dt = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_bw = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_size = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_io_size = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_iosize = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)

# CPU statistics, indexed [rank][id].
cpu_count = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_tm = _zero_table(MAX_RANK, MAX_ID)
cpu_io_size = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_size = _zero_table(MAX_RANK, MAX_ID)
cpu_avg_tm = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_bw = _zero_table(MAX_RANK, MAX_ID)
cpu_avg_bw = _zero_table(MAX_RANK, MAX_ID)

# Per-rank GPU averages over all buffers, indexed [rank][id].
gpu_rank_avg_tm = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_dt = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_bw = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_iosize = _zero_table(MAX_RANK, MAX_ID)

# Counters indexed [rank][buf].
cpu_event_count = _zero_table(MAX_RANK, MAX_BUF)
gpu_event_count = _zero_table(MAX_RANK, MAX_BUF)
gpu_op_count = _zero_table(MAX_RANK, MAX_BUF)

# default: do not show per rank/buf data
show_rank_buf = 0
# default: do not generate the json file
need_dump_json = 0
# default: use normal rccl test -n 20
rccl_normal_test = 1


class Stack:
    """Minimal LIFO container used to pair GPU entry and exit timestamps.

    ``pop`` and ``peek`` return a message string instead of raising when the
    stack is empty, so callers must check ``is_empty()`` first if they need a
    real element.
    """

    def __init__(self):
        self.items = []

    def push(self, item):
        """Put ``item`` on top of the stack."""
        self.items.append(item)

    def pop(self):
        """Remove and return the top item, or a message string when empty."""
        if not self.is_empty():
            return self.items.pop()
        else:
            # The message reads "stack is empty, cannot pop".
            return "堆栈为空, 无法执行pop操作"

    def peek(self):
        """Return the top item without removing it, or a message string when empty."""
        if not self.is_empty():
            return self.items[-1]
        else:
            # The message reads "stack is empty".
            return "堆栈为空"

    def is_empty(self):
        """Return True when the stack holds no items."""
        return len(self.items) == 0

    def size(self):
        """Return the number of items on the stack."""
        return len(self.items)


def parse_npkit_event_header(npkit_event_header_path):
    """Build the id <-> name maps from ``#define NPKIT_EVENT_*`` lines.

    Only lines that split into exactly three whitespace-separated fields are
    used; the third field is parsed with ``int(value, 0)`` so hexadecimal and
    decimal literals both work.

    Returns:
        dict with the keys ``'id_to_type'`` and ``'type_to_id'``.
    """
    npkit_event_def = {'id_to_type': {}, 'type_to_id': {}}
    with open(npkit_event_header_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith('#define NPKIT_EVENT_'):
                continue
            fields = line.split()
            if len(fields) != 3:
                continue
            event_type = fields[1]
            event_id = int(fields[2], 0)
            npkit_event_def['type_to_id'][event_type] = event_id
            npkit_event_def['id_to_type'][event_id] = event_type
    return npkit_event_def


def parse_gpu_clock_scale(gpu_clock_file_path):
    """Read a clock rate in kHz from the file and return it in MHz."""
    with open(gpu_clock_file_path, 'r') as f:
        freq_in_khz = f.read()
    return float(freq_in_khz) * 1e3 / 1e6


def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path):
    """Return ``den / num / 1e6`` from the two clock-period files."""
    with open(cpu_clock_num_file_path, 'r') as f:
        num = float(f.read())
    with open(cpu_clock_den_file_path, 'r') as f:
        den = float(f.read())
    return den / num / 1e6


def parse_gpu_event(event_bytes):
    """Decode one 16-byte little-endian GPU record.

    Layout: byte 0 id, bytes 1-4 size, bytes 5-7 rsvd, bytes 8-15 timestamp.
    """
    return {
        'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False),
        'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False),
        'rsvd': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False),
        'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False)
    }


def parse_cpu_event(event_bytes):
    """Decode one 16-byte little-endian CPU record.

    Layout: byte 0 id, bytes 1-4 size, bytes 5-7 slot, bytes 8-15 timestamp.
    """
    return {
        'id': int.from_bytes(event_bytes[0:1], byteorder='little', signed=False),
        'size': int.from_bytes(event_bytes[1:5], byteorder='little', signed=False),
        'slot': int.from_bytes(event_bytes[5:8], byteorder='little', signed=False),
        'timestamp': int.from_bytes(event_bytes[8:16], byteorder='little', signed=False)
    }


def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale):
    """Parse one GPU event dump and update the GPU statistics tables.

    Args:
        npkit_dump_dir: directory containing ``gpu_events_rank_<r>_buf_<b>``.
        npkit_event_def: maps returned by ``parse_npkit_event_header``.
        rank: rank number, also used as index into the statistics tables.
        buf_idx: buffer number, also used as index into the statistics tables.
        gpu_clock_scale: divisor turning GPU timestamps into microseconds
            (the caller passes the GPU clock rate in MHz).
        cpu_clock_scale: accepted for interface compatibility; not used here.

    Returns:
        List of trace-event dicts (``ph`` is ``'B'`` for ``*_ENTRY`` events and
        ``'E'`` otherwise) with ``ts`` in microseconds.

    Raises:
        KeyError: if a record carries an id missing from ``npkit_event_def``.
        ValueError: if a profiling event precedes every CPU time-sync event.
    """
    gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx))
    raw_event_size = 16
    id_to_type = npkit_event_def['id_to_type']
    curr_cpu_base_time = None
    curr_gpu_base_time = None
    gpu_events = []
    event_type_to_seq = {}
    mstack = Stack()

    with open(gpu_event_file_path, 'rb') as f:
        raw_content = f.read()

    # Walk complete records only: a trailing partial record (truncated dump)
    # would otherwise be decoded into a bogus event.
    for raw_content_idx in range(0, len(raw_content) - raw_event_size + 1, raw_event_size):
        parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx:raw_content_idx + raw_event_size])
        gpu_event_count[rank][buf_idx] += 1
        event_type = id_to_type[parsed_gpu_event['id']]

        if event_type == 'NPKIT_EVENT_TIME_SYNC_CPU':
            # Only the first CPU sync point is used as the time base.
            if curr_cpu_base_time is None:
                curr_cpu_base_time = parsed_gpu_event['timestamp'] / 1000
            continue

        # The first GPU-side timestamp seen (sync event or not) is the GPU base.
        gpu_time_us = parsed_gpu_event['timestamp'] / gpu_clock_scale
        if curr_gpu_base_time is None:
            curr_gpu_base_time = gpu_time_us
        if event_type == 'NPKIT_EVENT_TIME_SYNC_GPU':
            continue

        if curr_cpu_base_time is None:
            raise ValueError(
                'gpu rank:%d buf_idx:%d event %s appears before any NPKIT_EVENT_TIME_SYNC_CPU event'
                % (rank, buf_idx, event_type))

        phase = 'B' if event_type.endswith('_ENTRY') else 'E'
        ts_time_us = curr_cpu_base_time + gpu_time_us - curr_gpu_base_time
        trace_event = {
            'ph': phase,
            'ts': ts_time_us,  # time unit is usec
            'pid': rank,
            'tid': buf_idx + 1
        }

        if phase == 'B':
            seq = event_type_to_seq.get(event_type, 0)
            trace_event.update({
                'name': event_type,
                'cat': 'GPU',
                'args': {
                    'rank': rank,
                    'buf_idx': buf_idx,
                    'seq': seq,
                    'rsvd_0': parsed_gpu_event['rsvd'],
                    'size_0': parsed_gpu_event['size']
                }
            })
            event_type_to_seq[event_type] = seq + 1
            mstack.push(ts_time_us)
            gpu_events.append(trace_event)
            continue

        if mstack.is_empty():
            # Stack.pop() would hand back a message string here and the
            # subtraction below would fail, so drop the unpaired record.
            print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} id:{parsed_gpu_event['id']} "
                  f"closing event without a matching entry, skipped")
            continue

        prev_time = mstack.pop()
        delta_time = ts_time_us - prev_time
        data_time = parsed_gpu_event['rsvd'] / gpu_clock_scale
        trace_event['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}
        trace_event['args']['bw (GB/s)'] = 0. if delta_time == 0. else parsed_gpu_event['size'] / delta_time / 1e3
        gpu_events.append(trace_event)

        cur_id = parsed_gpu_event['id']
        if cur_id in op_group:
            gpu_op_count[rank][buf_idx] += 1
        # Statistics are gathered for every closing event while the operation
        # counter is inside a measured window (see check_op).
        if check_op(gpu_op_count[rank][buf_idx]):
            if gpu_io_size[rank][buf_idx][cur_id] == 0:
                gpu_io_size[rank][buf_idx][cur_id] = parsed_gpu_event['size']
            gpu_io_count[rank][buf_idx][cur_id] += 1
            gpu_sum_tm[rank][buf_idx][cur_id] += delta_time
            gpu_sum_dt[rank][buf_idx][cur_id] += data_time
            gpu_sum_size[rank][buf_idx][cur_id] += parsed_gpu_event['size']

    if show_rank_buf == 1:
        print("------------------------------------------------- show rank:", rank, "buf:", buf_idx, "-------------------------------------------------")
        print("%22s %51s %10s %10s %10s %10s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "AvgDt(us)", "BW(GB/s)", "Count"))

    for i in range(MAX_ID):
        count = gpu_io_count[rank][buf_idx][i]
        if count == 0:
            continue
        gpu_avg_tm[rank][buf_idx][i] = gpu_sum_tm[rank][buf_idx][i] / count
        gpu_avg_dt[rank][buf_idx][i] = gpu_sum_dt[rank][buf_idx][i] / count
        if gpu_sum_tm[rank][buf_idx][i]:
            gpu_avg_bw[rank][buf_idx][i] = gpu_sum_size[rank][buf_idx][i] / gpu_sum_tm[rank][buf_idx][i] / 1e3
        gpu_avg_iosize[rank][buf_idx][i] = gpu_sum_size[rank][buf_idx][i] / count
        # Report varying io sizes once, for rank 0 / buf 0 only.
        if gpu_io_size[rank][buf_idx][i] * count != gpu_sum_size[rank][buf_idx][i] and rank == 0 and buf_idx == 0:
            print(f"===note===gpu has diff io size rank:{rank} buf:{buf_idx} id:{i} "
                  f"sum_iocount_size:{gpu_io_size[rank][buf_idx][i] * gpu_io_count[rank][buf_idx][i]} sum_size:{gpu_sum_size[rank][buf_idx][i]} "
                  f"record_io_size:{gpu_io_size[rank][buf_idx][i]} avg_io_size:{int(gpu_avg_iosize[rank][buf_idx][i])}")
        if show_rank_buf == 1:
            print(f"[{i:2}]{id_to_type[i]:70} {gpu_io_size[rank][buf_idx][i]:10} {round(gpu_avg_tm[rank][buf_idx][i], 3):10}"
                  f" {round(gpu_avg_dt[rank][buf_idx][i], 3):10} {round(gpu_avg_bw[rank][buf_idx][i], 3):10} {gpu_io_count[rank][buf_idx][i]:6}")

    # Entries left on the stack never saw their closing event.
    if mstack.size() != 0:
        print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} invalid stack size:{mstack.size()}! event_num:{gpu_event_count[rank][buf_idx]} max:{MAX_GPU_BUF_EVENT_NUM}")
    return gpu_events


def check_op(op_count):
    """Return 1 when statistics should be recorded for this operation count.

    Always 1 when ``rccl_normal_test`` is 0; otherwise 1 only inside the
    [OUT_START1, OUT_END1] or [IN_START1, IN_END1] windows, else 0.
    """
    if rccl_normal_test == 0:
        return 1
    if OUT_START1 <= op_count <= OUT_END1 or IN_START1 <= op_count <= IN_END1:
        return 1
    return 0


def get_all_bw(event_type, channel_bw, nbuf):
    """Return the bandwidth over all buffers for one event type.

    Event types containing ``"ALGO"`` are returned unchanged; every other
    type is multiplied by ``nbuf``.
    """
    if "ALGO" in event_type:
        return channel_bw
    return channel_bw * nbuf


def show_result(npkit_event_def, nrank, nbuf):
    """Aggregate the statistics tables and print the GPU and CPU reports.

    Args:
        npkit_event_def: id/name maps used for display names. Ids missing
            from it are printed with an ``UNKNOWN_EVENT_<id>`` placeholder.
        nrank: number of ranks to report (table indices ``0..nrank-1``).
        nbuf: number of GPU buffers per rank (table indices ``0..nbuf-1``).
    """
    id_to_type = npkit_event_def['id_to_type']

    for i in range(nrank):
        for j in range(nbuf):
            if gpu_op_count[i][j] != ALL_NUM and rccl_normal_test:
                print(f"===Warning===gpu rank:{i} buf_idx:{j} invalid gpu_op_count:{gpu_op_count[i][j]} ALL:{ALL_NUM}")

    # Per-rank averages: mean of the per-buffer averages over nbuf buffers.
    for i in range(MAX_ID):
        for j in range(nrank):
            sum_tm = 0
            sum_dt = 0
            sum_bw = 0
            sum_io = 0
            for k in range(nbuf):
                sum_tm += gpu_avg_tm[j][k][i]
                sum_dt += gpu_avg_dt[j][k][i]
                sum_bw += gpu_avg_bw[j][k][i]
                sum_io += gpu_avg_iosize[j][k][i]
            if sum_tm > 0:
                gpu_rank_avg_tm[j][i] = sum_tm / nbuf
                gpu_rank_avg_dt[j][i] = sum_dt / nbuf
                gpu_rank_avg_bw[j][i] = sum_bw / nbuf
                gpu_rank_avg_iosize[j][i] = sum_io / nbuf

    print(f"\nNOTE:\n INPUT: get data from user input buff\n RECV: get data from prev gpu\nREDUCE: do data reduce operate\n SEND: send data to next gpu\nOUTPUT: send data to user output buff")

    for i in range(nrank):
        print(" ")
        print("========================================================== gpu show rank:", i, "buf_num:", nbuf, "==========================================================")
        print("%23s %51s %10s %10s %10s %10s %13s %12s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "AvgDt(us)", "Dt/Tm(%)", "ChanBW(GB/s)", "AllBW(GB/s)", "Count"))
        for j in range(MAX_ID):
            if gpu_rank_avg_tm[i][j] <= 0:
                continue
            event_name = id_to_type.get(j, 'UNKNOWN_EVENT_%d' % j)
            io_size = int(gpu_rank_avg_iosize[i][j])
            # Count column: first non-zero per-buffer count of this rank.
            io_count = next((gpu_io_count[i][k][j] for k in range(nbuf) if gpu_io_count[i][k][j]), 0)
            print(f"[{j:3}]{event_name:70} {io_size:10} {round(gpu_rank_avg_tm[i][j], 3):10}"
                  f"{round(gpu_rank_avg_dt[i][j], 3):11} {round(gpu_rank_avg_dt[i][j] * 100 /gpu_rank_avg_tm[i][j], 1):10} {round(gpu_rank_avg_bw[i][j], 3):13}"
                  f"{round(get_all_bw(event_name, gpu_rank_avg_bw[i][j], nbuf), 3):12} {io_count:6}")

    for i in range(nrank):
        for j in range(MAX_ID):
            if cpu_count[i][j] == 0:
                continue
            cpu_avg_tm[i][j] = cpu_sum_tm[i][j] / cpu_count[i][j]
            # cpu_sum_tm is a sum of durations clamped to >= 0.001, so it is
            # non-zero whenever cpu_count is.
            cpu_avg_bw[i][j] = cpu_sum_size[i][j] / cpu_sum_tm[i][j] / 1e3
            if cpu_count[i][j] * cpu_io_size[i][j] != cpu_sum_size[i][j]:
                print(f"===Warning===cpu invalid sum size rank:{i} id:{j} sum_iocount_size:{cpu_io_size[i][j] * cpu_count[i][j]} sum_size:{cpu_sum_size[i][j]}")

    for i in range(nrank):
        print(" ")
        print("================================================= cpu show rank:", i, "=================================================")
        print("%23s %51s %10s %10s %10s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "BW(GB/s)", "Count"))
        for j in range(MAX_ID):
            if cpu_avg_tm[i][j] > 0:
                event_name = id_to_type.get(j, 'UNKNOWN_EVENT_%d' % j)
                print(f"[{j:3}]{event_name:70} {cpu_io_size[i][j]:10} {round(cpu_avg_tm[i][j], 3):10} {round(cpu_avg_bw[i][j], 3):10} {cpu_count[i][j]:6}")
    print(" ")


def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale):
    """Parse one CPU event dump and update the CPU statistics tables.

    Entry events are assigned to the lowest free "fiber" (a display lane);
    the closing event for the same slot releases it. The trace ``tid`` is
    ``fiber_id + (channel + 1) * 1000``.

    Args:
        npkit_dump_dir: directory containing ``cpu_events_rank_<r>_channel_<c>``.
        npkit_event_def: maps returned by ``parse_npkit_event_header``.
        rank: rank number, also used as index into the statistics tables.
        channel: channel number of the dump file.
        cpu_clock_scale: accepted for interface compatibility; not used here.

    Returns:
        List of trace-event dicts with ``ts`` in microseconds.

    Raises:
        KeyError: if a record carries an id missing from ``npkit_event_def``.
    """
    cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel))
    raw_event_size = 16
    id_to_type = npkit_event_def['id_to_type']
    cpu_events = []
    event_type_to_seq = {}
    fiber_is_usable = []
    fiber_open_ts = []
    slot_to_fiber_id = {}
    channel_shift = 1000

    with open(cpu_event_file_path, 'rb') as f:
        raw_content = f.read()

    # Complete records only; a trailing partial record is ignored.
    for raw_content_idx in range(0, len(raw_content) - raw_event_size + 1, raw_event_size):
        parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx:raw_content_idx + raw_event_size])
        event_type = id_to_type[parsed_cpu_event['id']]
        phase = 'B' if event_type.endswith('_ENTRY') else 'E'
        slot = parsed_cpu_event['slot']
        trace_event = {
            'ph': phase,
            'ts': parsed_cpu_event['timestamp'] / 1000,  # time unit is usec
            'pid': rank
        }

        if phase == 'B':
            # Open fiber event: reuse the first free fiber or add a new one.
            fiber_id = next((idx for idx, usable in enumerate(fiber_is_usable) if usable), len(fiber_is_usable))
            if fiber_id == len(fiber_is_usable):
                fiber_is_usable.append(True)
                fiber_open_ts.append(0.0)
            slot_to_fiber_id[slot] = fiber_id
            fiber_open_ts[fiber_id] = trace_event['ts']
            fiber_is_usable[fiber_id] = False

            seq = event_type_to_seq.get(event_type, 0)
            trace_event.update({
                'name': event_type,
                'cat': 'CPU',
                'args': {
                    'rank': rank,
                    'channel': channel,
                    'slot': slot,
                    'seq': seq,
                    'size_0': parsed_cpu_event['size']
                }
            })
            event_type_to_seq[event_type] = seq + 1
        else:
            # Close fiber event.
            if slot not in slot_to_fiber_id:
                # No entry was recorded for this slot; there is no fiber or
                # start time to pair it with, so drop the record.
                print(f"===Warning===cpu rank:{rank} channel:{channel} id:{parsed_cpu_event['id']} slot:{slot} "
                      f"closing event without a matching entry, skipped")
                continue
            fiber_id = slot_to_fiber_id.pop(slot)
            last_ts = fiber_open_ts[fiber_id]
            fiber_is_usable[fiber_id] = True

            # Clamp to 0.001 so the divisions below never see zero.
            delta_time = max(0.001, trace_event['ts'] - last_ts)
            trace_event['args'] = {'size': parsed_cpu_event['size']}
            trace_event['args']['bw (GB/s)'] = parsed_cpu_event['size'] / delta_time / 1e3

            cur_id = parsed_cpu_event['id']
            cpu_count[rank][cur_id] += 1
            cpu_sum_tm[rank][cur_id] += delta_time
            cpu_sum_size[rank][cur_id] += parsed_cpu_event['size']
            if cpu_io_size[rank][cur_id] == 0:
                cpu_io_size[rank][cur_id] = parsed_cpu_event['size']
            elif parsed_cpu_event['size'] != cpu_io_size[rank][cur_id]:
                print(f"===Warning===cpu rank:{rank} id:{cur_id} invaid io szie:{parsed_cpu_event['size']} recode io size:{cpu_io_size[rank][cur_id]}")

        trace_event['tid'] = fiber_id + (channel + 1) * channel_shift
        cpu_events.append(trace_event)
    return cpu_events


def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def):
    """Parse every dump file in ``npkit_dump_dir``, print the report and
    optionally write ``npkit_event_trace.json`` into ``output_dir``.

    Display names for the report come from ``npkit_event_display.h`` located
    next to this script. The JSON file is only written when the module-level
    ``need_dump_json`` flag is 1.
    """
    files_in_dump_dir = next(os.walk(npkit_dump_dir))[2]
    gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')]
    cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')]

    # Sorted so files are processed and reported in a deterministic order.
    ranks = sorted({int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files})
    buf_indices = sorted({int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files})
    channels = sorted({int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files})
    print(f"mode:{rccl_normal_test} rank_num:{len(ranks)} buf_num:{len(buf_indices)} OUT_START1:{OUT_START1} OUT_END1:{OUT_END1} IN_START1:{IN_START1} IN_END1:{IN_END1} ALL_NUM:{ALL_NUM}")

    trace = {'traceEvents': []}
    for rank in ranks:
        cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank)
        cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank)
        cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path)

        gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank)
        gpu_clock_scale_mhz = parse_gpu_clock_scale(gpu_clock_file_path)
        if gpu_clock_scale_mhz == 0:
            # A zero rate would divide by zero when scaling timestamps.
            print(f"===Warning===gpu rank:{rank} gpu clock from file is zero, use default clock rate:25 mhz")
            gpu_clock_scale_mhz = 25
        print(f"rank:{rank} gpu_clock_scale:{gpu_clock_scale_mhz}")

        for buf_idx in buf_indices:
            gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale_mhz, cpu_clock_scale)
            trace['traceEvents'].extend(gpu_events)

        for channel in channels:
            cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale)
            trace['traceEvents'].extend(cpu_events)

    trace['traceEvents'].sort(key=lambda x: x['ts'])
    trace['displayTimeUnit'] = 'ns'

    current_path = os.path.dirname(os.path.abspath(__file__))
    display_file_path = os.path.join(current_path, 'npkit_event_display.h')
    display_event_def = parse_npkit_event_header(display_file_path)
    show_result(display_event_def, len(ranks), len(buf_indices))

    if need_dump_json == 1:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f:
            json.dump(trace, f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.')
    parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.')
    parser.add_argument('--output_dir', type=str, required=False, help='Path to output directory.')
    parser.add_argument('--rccl_normal_test', type=int, required=False, help='Rccl normal test mode.')
    args = parser.parse_args()

    # The output directory defaults to the dump directory.
    if args.output_dir is None:
        args.output_dir = args.npkit_dump_dir
    print(f"output_dir:{args.output_dir}")

    # Runs at module level, so this rebinds the global read by check_op().
    if args.rccl_normal_test is not None:
        rccl_normal_test = args.rccl_normal_test

    npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path)
    convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def)