# Copyright (c) Microsoft Corporation.
# Modifications Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Licensed under the MIT License.

# Example run:
# python3 ./[rccl]/tools/scripts/npkit_trace_generator.py --npkit_dump_dir=[npkit_dump_dir] --npkit_event_header_path=[rccl]/src/include/npkit/npkit_event.h --output_dir=[output_dir]

import argparse
import os
import json

from queue import Queue

# Upper bounds used to size the statistics tables defined below.
MAX_RANK = 32
MAX_BUF = 32
MAX_ID = 256
MAX_GPU_BUF_EVENT_NUM = 1 << 20
MAX_CPU_BUF_EVENT_NUM = 2 << 20

# Operation-count layout of one profiled run. check_op() only admits
# operations whose running index lies in [OUT_START1, OUT_END1] or
# [IN_START1, IN_END1]; show_result() expects ALL_NUM operations per buffer.
# NOTE(review): the names suggest warm-up iterations followed by an
# out-of-place and an in-place measurement pass of `rccl-tests -n 20` -- confirm.
WARM_NUM = 10
RUN_NUM = 20
PRE_NUM = 1
CHECK_NUM = 1
_PASS_NUM = PRE_NUM + RUN_NUM + CHECK_NUM  # operations in one measurement pass
OUT_START1 = WARM_NUM + PRE_NUM + 1  # 12
OUT_END1 = OUT_START1 + RUN_NUM - 1  # 31
IN_START1 = WARM_NUM + _PASS_NUM + PRE_NUM + 1  # 34
IN_END1 = IN_START1 + RUN_NUM - 1  # 53
ALL_NUM = WARM_NUM + 2 * _PASS_NUM  # 54

# Event ids that count as one operation: an exit event carrying one of these
# ids advances the per-(rank, buf) operation counter in parse_gpu_event_file().
OP_ALLREDUCE_RING = 0x2
OP_ALLREDUCE_TREE = 0x4
OP_ALLGATHER = 0x51
OP_SENDL = 0x4A
OP_SEND = 0x4C
OP_RECV = 0x4E
OP_BCAST = 0x6B

op_group = [
    OP_ALLREDUCE_RING,
    OP_ALLREDUCE_TREE,
    OP_ALLGATHER,
    OP_SENDL,
    OP_SEND,
    OP_RECV,
    OP_BCAST,
]


def _zero_table(*dims):
    """Return a new nested list of zeros; every row is a distinct list object."""
    head, rest = dims[0], dims[1:]
    if not rest:
        return [0] * head
    return [_zero_table(*rest) for _ in range(head)]


# GPU statistics indexed as [rank][buf][id]
gpu_io_count = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_tm = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_tm = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_dt = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_dt = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_bw = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_sum_size = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_io_size = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)
gpu_avg_iosize = _zero_table(MAX_RANK, MAX_BUF, MAX_ID)

# CPU statistics indexed as [rank][id]
cpu_count = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_tm = _zero_table(MAX_RANK, MAX_ID)
cpu_io_size = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_size = _zero_table(MAX_RANK, MAX_ID)
cpu_avg_tm = _zero_table(MAX_RANK, MAX_ID)
cpu_sum_bw = _zero_table(MAX_RANK, MAX_ID)
cpu_avg_bw = _zero_table(MAX_RANK, MAX_ID)

# Per-rank GPU averages over all buffers, indexed as [rank][id]
gpu_rank_avg_tm = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_dt = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_bw = _zero_table(MAX_RANK, MAX_ID)
gpu_rank_avg_iosize = _zero_table(MAX_RANK, MAX_ID)

# Event / operation counters indexed as [rank][buf]
cpu_event_count = _zero_table(MAX_RANK, MAX_BUF)
gpu_event_count = _zero_table(MAX_RANK, MAX_BUF)
gpu_op_count = _zero_table(MAX_RANK, MAX_BUF)

# 1: print the per-(rank, buf) table while parsing GPU dumps; 0 (default): do not.
show_rank_buf = 0
# 1: write npkit_event_trace.json after parsing; 0 (default): do not.
need_dump_json = 0
# 1 (default): assume a normal rccl test (-n 20) and only sample the
# operation windows defined above; 0: sample every operation.
rccl_normal_test = 1

class Stack:
    """Minimal LIFO container backed by a Python list.

    ``pop`` and ``peek`` never raise on an empty stack: they return a
    message string instead. Callers that must tell "empty" apart from a
    stored value should test ``is_empty()`` first.
    """

    def __init__(self):
        # The end of the list is the top of the stack.
        self.items = []

    def push(self, item):
        """Put ``item`` on top of the stack."""
        self.items.append(item)

    def pop(self):
        """Remove and return the top item, or a message string when empty."""
        if self.items:
            return self.items.pop()
        return "堆栈为空, 无法执行pop操作"

    def peek(self):
        """Return the top item without removing it, or a message string when empty."""
        if self.items:
            return self.items[-1]
        return "堆栈为空"

    def is_empty(self):
        """Return True when no items are stored."""
        return not self.items

    def size(self):
        """Return the number of stored items."""
        return len(self.items)

def parse_npkit_event_header(npkit_event_header_path):
    """Build id <-> name lookup tables from ``#define NPKIT_EVENT_*`` lines.

    Only lines that, once stripped, start with ``#define NPKIT_EVENT_`` and
    consist of exactly three whitespace-separated tokens are used. The third
    token is converted with ``int(value, 0)``, so decimal, hex, octal and
    binary literals are accepted.

    Args:
        npkit_event_header_path: path of the header file to read.

    Returns:
        A dict with two keys: ``'id_to_type'`` (int id -> macro name) and
        ``'type_to_id'`` (macro name -> int id).

    Raises:
        ValueError: if the value token of a matching line is not an integer literal.
    """
    id_to_type = {}
    type_to_id = {}
    with open(npkit_event_header_path, 'r') as header:
        for raw_line in header:
            text = raw_line.strip()
            if not text.startswith('#define NPKIT_EVENT_'):
                continue
            tokens = text.split()
            if len(tokens) != 3:
                # Macros without a value or with trailing tokens are ignored.
                continue
            _, macro_name, macro_value = tokens
            numeric_id = int(macro_value, 0)
            type_to_id[macro_name] = numeric_id
            id_to_type[numeric_id] = macro_name
    return {'id_to_type': id_to_type, 'type_to_id': type_to_id}

def parse_gpu_clock_scale(gpu_clock_file_path):
    """Read a clock rate from a text file and return it scaled by 1e3 / 1e6.

    The file content is parsed as a single float. The original code names the
    value ``freq_in_khz``, so the result is the rate in MHz if the file really
    stores kHz.

    Args:
        gpu_clock_file_path: path of the text file holding the clock rate.

    Returns:
        The parsed value multiplied by 1e3 and divided by 1e6, as a float.
    """
    with open(gpu_clock_file_path, 'r') as clock_file:
        khz_text = clock_file.read()
    hz = float(khz_text) * 1e3
    return hz / 1e6

def parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path):
    """Return ``den / num / 1e6`` from two single-number text files.

    The numerator file is read first, then the denominator file.
    NOTE(review): the file names suggest a clock period expressed as a
    num/den ratio -- confirm against the code that writes the dump.

    Args:
        cpu_clock_den_file_path: path of the file holding the denominator.
        cpu_clock_num_file_path: path of the file holding the numerator.

    Returns:
        The denominator divided by the numerator, divided by 1e6, as a float.
    """
    def read_number(path):
        with open(path, 'r') as number_file:
            return float(number_file.read())

    numerator = read_number(cpu_clock_num_file_path)
    denominator = read_number(cpu_clock_den_file_path)
    return denominator / numerator / 1e6

def parse_gpu_event(event_bytes):
    """Decode one raw GPU event record into a dict of unsigned integers.

    All fields are little-endian and unsigned:
        byte 0       -> 'id'
        bytes 1..4   -> 'size'
        bytes 5..7   -> 'rsvd'
        bytes 8..15  -> 'timestamp'

    A field whose bytes are missing from a short input decodes from whatever
    bytes are present (an empty slice decodes to 0).
    """
    layout = (
        ('id', 0, 1),
        ('size', 1, 5),
        ('rsvd', 5, 8),
        ('timestamp', 8, 16),
    )
    return {
        field: int.from_bytes(event_bytes[start:stop], byteorder='little', signed=False)
        for field, start, stop in layout
    }

def parse_cpu_event(event_bytes):
    """Decode one raw CPU event record into a dict of unsigned integers.

    All fields are little-endian and unsigned:
        byte 0       -> 'id'
        bytes 1..4   -> 'size'
        bytes 5..7   -> 'slot'
        bytes 8..15  -> 'timestamp'

    A field whose bytes are missing from a short input decodes from whatever
    bytes are present (an empty slice decodes to 0).
    """
    layout = (
        ('id', 0, 1),
        ('size', 1, 5),
        ('slot', 5, 8),
        ('timestamp', 8, 16),
    )
    return {
        field: int.from_bytes(event_bytes[start:stop], byteorder='little', signed=False)
        for field, start, stop in layout
    }

def parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale, cpu_clock_scale):
    """Parse one GPU event dump, build trace events and accumulate statistics.

    Reads ``gpu_events_rank_<rank>_buf_<buf_idx>`` from ``npkit_dump_dir`` as a
    sequence of 16-byte records.

    * The first ``NPKIT_EVENT_TIME_SYNC_CPU`` record sets the CPU base time
      (timestamp / 1000); the first GPU-clocked record sets the GPU base time
      (timestamp / ``gpu_clock_scale``). Later sync records are ignored.
    * Every other record becomes a trace event: phase ``'B'`` when the event
      type name ends with ``_ENTRY``, otherwise ``'E'``.
    * Each ``'E'`` event is paired with the most recent still-open ``'B'``
      timestamp (LIFO). When ``check_op()`` accepts the current operation
      count, the elapsed time, the ``rsvd``-derived data time and the size are
      added to the module-level ``gpu_*`` tables.
    * After parsing, per-id averages for this (rank, buf) are stored in the
      ``gpu_avg_*`` tables and, when ``show_rank_buf == 1``, printed.

    Malformed dumps are tolerated instead of raising ``TypeError``:
    a truncated trailing record is skipped, an ``'E'`` event without a pending
    ``'B'`` is excluded from the timing statistics, and events that appear
    before any CPU time-sync record use a CPU base time of 0.0. Each case is
    reported with a warning.

    Args:
        npkit_dump_dir: directory containing the dump files.
        npkit_event_def: dict with an ``'id_to_type'`` mapping (id -> name).
        rank: rank number; used in the file name and as table index.
        buf_idx: buffer index; used in the file name and as table index.
        gpu_clock_scale: divisor applied to raw GPU timestamps (the caller
            passes the GPU clock rate in MHz, giving microseconds).
        cpu_clock_scale: accepted for interface compatibility; not used.

    Returns:
        List of trace event dicts (keys ``ph``, ``ts``, ``pid``, ``tid`` and,
        depending on the phase, ``name``, ``cat``, ``args``).

    Raises:
        KeyError: if a record carries an id missing from ``npkit_event_def``.
    """
    gpu_event_file_path = os.path.join(npkit_dump_dir, 'gpu_events_rank_%d_buf_%d' % (rank, buf_idx))
    raw_event_size = 16
    curr_cpu_base_time = None
    curr_gpu_base_time = None
    gpu_events = []
    event_type_to_seq = {}
    mstack = Stack()
    unmatched_exit_num = 0
    id_to_type = npkit_event_def['id_to_type']

    with open(gpu_event_file_path, 'rb') as f:
        raw_content = f.read()

    # A dump cut off mid-record would otherwise be decoded from a short slice
    # and produce a bogus event; only whole records are processed.
    trailing_bytes = len(raw_content) % raw_event_size
    if trailing_bytes != 0:
        print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} ignore truncated trailing record of {trailing_bytes} bytes")

    for raw_content_idx in range(0, len(raw_content) - trailing_bytes, raw_event_size):
        parsed_gpu_event = parse_gpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
        gpu_event_count[rank][buf_idx] += 1
        event_type = id_to_type[parsed_gpu_event['id']]

        if event_type == 'NPKIT_EVENT_TIME_SYNC_CPU':
            # Only the first CPU sync record defines the base time.
            if curr_cpu_base_time is None:
                curr_cpu_base_time = parsed_gpu_event['timestamp'] / 1000
            continue

        # Every remaining record carries a GPU-clock timestamp; the first one
        # seen (sync record or regular event) becomes the GPU base time.
        gpu_time = parsed_gpu_event['timestamp'] / gpu_clock_scale
        if curr_gpu_base_time is None:
            curr_gpu_base_time = gpu_time
        if event_type == 'NPKIT_EVENT_TIME_SYNC_GPU':
            continue

        if curr_cpu_base_time is None:
            # Without a CPU sync record the absolute offset is unknown. Deltas
            # between GPU events are unaffected, so keep going with base 0.
            print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} event found before NPKIT_EVENT_TIME_SYNC_CPU, use cpu base time 0")
            curr_cpu_base_time = 0.0

        phase = 'B' if event_type.endswith('_ENTRY') else 'E'
        ts_time_us = curr_cpu_base_time + gpu_time - curr_gpu_base_time
        event = {
            'ph': phase,
            'ts': ts_time_us, # time unit is usec
            'pid': rank,
            'tid': buf_idx + 1
        }
        gpu_events.append(event)

        if phase == 'B':
            seq = event_type_to_seq.get(event_type, 0)
            event.update({
                'name': event_type,
                'cat': 'GPU',
                'args': {
                    'rank': rank,
                    'buf_idx': buf_idx,
                    'seq': seq,
                    'rsvd_0': parsed_gpu_event['rsvd'],
                    'size_0': parsed_gpu_event['size']
                }
            })
            event_type_to_seq[event_type] = seq + 1
            # Remember the start time until the matching exit event arrives.
            mstack.push(ts_time_us)
            continue

        event['args'] = {'size': parsed_gpu_event['size'], 'rsvd': parsed_gpu_event['rsvd']}

        # The operation counter advances on every exit event of an op id, so
        # the sampling windows stay aligned even if this exit is unmatched.
        cur_id = parsed_gpu_event['id']
        if cur_id in op_group:
            gpu_op_count[rank][buf_idx] += 1

        if mstack.is_empty():
            # No start time to pair with: keep the trace event, skip timing.
            unmatched_exit_num += 1
            continue

        prev_time = mstack.pop()
        delta_time = ts_time_us - prev_time
        data_time = parsed_gpu_event['rsvd'] / gpu_clock_scale

        event['args']['bw (GB/s)'] = 0. if delta_time == 0. else parsed_gpu_event['size'] / delta_time / 1e3

        if check_op(gpu_op_count[rank][buf_idx]):
            # The first sampled size is recorded as the reference io size.
            if gpu_io_size[rank][buf_idx][cur_id] == 0:
                gpu_io_size[rank][buf_idx][cur_id] = parsed_gpu_event['size']
            gpu_io_count[rank][buf_idx][cur_id] += 1
            gpu_sum_tm[rank][buf_idx][cur_id] += delta_time
            gpu_sum_dt[rank][buf_idx][cur_id] += data_time
            gpu_sum_size[rank][buf_idx][cur_id] += parsed_gpu_event['size']

    if show_rank_buf == 1:
        print("------------------------------------------------- show rank:", rank, "buf:", buf_idx, "-------------------------------------------------")
        print("%22s %51s %10s %10s %10s %10s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "AvgDt(us)", "BW(GB/s)", "Count"))
    for i in range(MAX_ID):
        if gpu_io_count[rank][buf_idx][i] == 0:
            continue
        gpu_avg_tm[rank][buf_idx][i] = gpu_sum_tm[rank][buf_idx][i] / gpu_io_count[rank][buf_idx][i]
        gpu_avg_dt[rank][buf_idx][i] = gpu_sum_dt[rank][buf_idx][i] / gpu_io_count[rank][buf_idx][i]
        # Bandwidth stays 0 when no time was accumulated.
        if gpu_sum_tm[rank][buf_idx][i]:
            gpu_avg_bw[rank][buf_idx][i] = gpu_sum_size[rank][buf_idx][i] / gpu_sum_tm[rank][buf_idx][i] / 1e3
        gpu_avg_iosize[rank][buf_idx][i] = gpu_sum_size[rank][buf_idx][i] / gpu_io_count[rank][buf_idx][i]

        # Sizes that vary between samples are only reported for rank 0 / buf 0.
        if gpu_io_size[rank][buf_idx][i] * gpu_io_count[rank][buf_idx][i] != gpu_sum_size[rank][buf_idx][i] and rank == 0 and buf_idx == 0:
            print(f"===note===gpu has diff io size rank:{rank} buf:{buf_idx} id:{i} "
                  f"sum_iocount_size:{gpu_io_size[rank][buf_idx][i] * gpu_io_count[rank][buf_idx][i]} sum_size:{gpu_sum_size[rank][buf_idx][i]} "
                  f"record_io_size:{gpu_io_size[rank][buf_idx][i]} avg_io_size:{int(gpu_avg_iosize[rank][buf_idx][i])}")

        if show_rank_buf == 1:
            print(f"[{i:2}]{npkit_event_def['id_to_type'][i]:70} {gpu_io_size[rank][buf_idx][i]:10} {round(gpu_avg_tm[rank][buf_idx][i], 3):10}"
                f" {round(gpu_avg_dt[rank][buf_idx][i], 3):10} {round(gpu_avg_bw[rank][buf_idx][i], 3):10} {gpu_io_count[rank][buf_idx][i]:6}")

    if unmatched_exit_num != 0:
        print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} {unmatched_exit_num} exit event(s) without matching entry event, excluded from timing statistics")
    if mstack.size() != 0:
        print(f"===Warning===gpu rank:{rank} buf_idx:{buf_idx} invalid stack size:{mstack.size()}! event_num:{gpu_event_count[rank][buf_idx]} max:{MAX_GPU_BUF_EVENT_NUM}")
    return gpu_events

def check_op(op_count):
    """Return 1 when the operation with this running index should be sampled.

    With ``rccl_normal_test == 0`` every operation is sampled. Otherwise only
    indices inside [OUT_START1, OUT_END1] or [IN_START1, IN_END1] (both
    inclusive) are sampled.

    Args:
        op_count: running operation index for one (rank, buf).

    Returns:
        1 if the operation is sampled, 0 otherwise.
    """
    if rccl_normal_test == 0:
        return 1

    in_first_window = OUT_START1 <= op_count <= OUT_END1
    in_second_window = IN_START1 <= op_count <= IN_END1
    return 1 if (in_first_window or in_second_window) else 0

def get_all_bw(event_type, channel_bw, nbuf):
    """Return the bandwidth shown in the "AllBW" report column.

    Event types whose name contains ``"ALGO"`` report ``channel_bw`` as is;
    every other event type reports ``channel_bw`` multiplied by ``nbuf``.

    Args:
        event_type: event type name.
        channel_bw: average per-buffer bandwidth.
        nbuf: number of buffers.
    """
    return channel_bw if "ALGO" in event_type else channel_bw * nbuf

def show_result(npkit_event_def, nrank, nbuf):
    """Aggregate the collected statistics per rank and print the report.

    Steps:
        1. Warn about every (rank, buf) whose operation count differs from
           ``ALL_NUM`` (only when ``rccl_normal_test`` is set).
        2. Average the per-buffer GPU figures over ``nbuf`` buffers into the
           ``gpu_rank_avg_*`` tables. Buffers in which an event never occurred
           contribute zeros to that average.
        3. Print one GPU table per rank.
        4. Compute the CPU averages and print one CPU table per rank.

    Ranks and buffers are addressed as ``0 .. nrank-1`` and ``0 .. nbuf-1``,
    i.e. they are assumed to be numbered contiguously from zero.

    Args:
        npkit_event_def: dict with an ``'id_to_type'`` mapping used for the
            event names in the tables. Ids missing from it are printed as
            ``UNKNOWN_EVENT_<id>`` instead of raising ``KeyError``.
        nrank: number of ranks to report.
        nbuf: number of GPU buffers per rank.
    """
    id_to_type = npkit_event_def['id_to_type']

    def event_name(event_id):
        # One id missing from the display header must not abort the report
        # after all dumps have already been parsed.
        return id_to_type.get(event_id, f"UNKNOWN_EVENT_{event_id}")

    for i in range(nrank):
        for j in range(nbuf):
            if gpu_op_count[i][j] != ALL_NUM and rccl_normal_test:
                print(f"===Warning===gpu rank:{i} buf_idx:{j} invalid gpu_op_count:{gpu_op_count[i][j]} ALL:{ALL_NUM}")

    for i in range(MAX_ID):
        for j in range(nrank):
            sum_tm = 0
            sum_dt = 0
            sum_bw = 0
            sum_io = 0
            for k in range(nbuf):
                sum_tm += gpu_avg_tm[j][k][i]
                sum_dt += gpu_avg_dt[j][k][i]
                sum_bw += gpu_avg_bw[j][k][i]
                sum_io += gpu_avg_iosize[j][k][i]
            if sum_tm > 0:
                gpu_rank_avg_tm[j][i] = sum_tm / nbuf
                gpu_rank_avg_dt[j][i] = sum_dt / nbuf
                gpu_rank_avg_bw[j][i] = sum_bw / nbuf
                gpu_rank_avg_iosize[j][i] = sum_io / nbuf

    print(f"\nNOTE:\n INPUT: get data from user input buff\n  RECV: get data from prev gpu\nREDUCE: do data reduce operate\n  SEND: send data to next gpu\nOUTPUT: send data to user output buff")

    for i in range(nrank):
        print(" ")
        print("========================================================== gpu show rank:", i, "buf_num:", nbuf, "==========================================================")
        print("%23s %51s %10s %10s %10s %10s %13s %12s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "AvgDt(us)", "Dt/Tm(%)", "ChanBW(GB/s)", "AllBW(GB/s)", "Count"))
        for j in range(MAX_ID):
            if gpu_rank_avg_tm[i][j] > 0:
                io_size = int(gpu_rank_avg_iosize[i][j])
                # Show the sample count of the first buffer that recorded this
                # event. Looking only at buffers 0 and 1 printed 0 for events
                # that occurred exclusively in a higher-numbered buffer.
                io_count = next((gpu_io_count[i][k][j] for k in range(nbuf) if gpu_io_count[i][k][j]), 0)
                name = event_name(j)
                print(f"[{j:3}]{name:70} {io_size:10} {round(gpu_rank_avg_tm[i][j], 3):10}"
                    f"{round(gpu_rank_avg_dt[i][j], 3):11} {round(gpu_rank_avg_dt[i][j] * 100 /gpu_rank_avg_tm[i][j], 1):10} {round(gpu_rank_avg_bw[i][j], 3):13}"
                    f"{round(get_all_bw(name, gpu_rank_avg_bw[i][j], nbuf), 3):12} {io_count:6}")

    for i in range(nrank):
        for j in range(MAX_ID):
            if cpu_count[i][j] == 0:
                continue
            cpu_avg_tm[i][j] = cpu_sum_tm[i][j] / cpu_count[i][j]
            cpu_avg_bw[i][j] = cpu_sum_size[i][j] / cpu_sum_tm[i][j] / 1e3
            if cpu_count[i][j] * cpu_io_size[i][j] != cpu_sum_size[i][j]:
                print(f"===Warning===cpu invalid sum size rank:{i} id:{j} sum_iocount_size:{cpu_io_size[i][j] * cpu_count[i][j]} sum_size:{cpu_sum_size[i][j]}")

    for i in range(nrank):
        print(" ")
        print("================================================= cpu show rank:", i, "=================================================")
        print("%23s %51s %10s %10s %10s %6s" % ("Iterm", " ", "IOSize", "AvgTm(us)", "BW(GB/s)", "Count"))
        for j in range(MAX_ID):
            if cpu_avg_tm[i][j] > 0:
                print(f"[{j:3}]{event_name(j):70} {cpu_io_size[i][j]:10} {round(cpu_avg_tm[i][j], 3):10} {round(cpu_avg_bw[i][j], 3):10} {cpu_count[i][j]:6}")
    print(" ")

def parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale):
    """Parse one CPU event dump, build trace events and accumulate statistics.

    Reads ``cpu_events_rank_<rank>_channel_<channel>`` from ``npkit_dump_dir``
    as a sequence of 16-byte records. An event whose type name ends with
    ``_ENTRY`` opens a "fiber" (the lowest-numbered free display lane) for its
    slot; any other event closes the fiber opened for the same slot and adds
    its elapsed time and size to the module-level ``cpu_*`` tables. The trace
    ``tid`` is ``fiber_id + (channel + 1) * 1000``.

    Malformed dumps are tolerated: a truncated trailing record is skipped and
    a closing event whose slot has no open fiber is dropped (previously a
    ``KeyError``). Both cases are reported with a warning.

    Args:
        npkit_dump_dir: directory containing the dump files.
        npkit_event_def: dict with an ``'id_to_type'`` mapping (id -> name).
        rank: rank number; used in the file name and as table index.
        channel: channel number; used in the file name and in the ``tid``.
        cpu_clock_scale: accepted for interface compatibility; not used
            (timestamps are divided by 1000 directly).

    Returns:
        List of trace event dicts.

    Raises:
        KeyError: if a record carries an id missing from ``npkit_event_def``.
    """
    cpu_event_file_path = os.path.join(npkit_dump_dir, 'cpu_events_rank_%d_channel_%d' % (rank, channel))
    raw_event_size = 16
    cpu_events = []
    event_type_to_seq = {}

    fiber_is_usable = []
    fiber_open_ts = []
    slot_to_fiber_id = {}
    channel_shift = 1000
    unmatched_exit_num = 0

    with open(cpu_event_file_path, 'rb') as f:
        raw_content = f.read()

    # Only whole records are decoded; a dump cut off mid-record would
    # otherwise yield a bogus event built from a short slice.
    trailing_bytes = len(raw_content) % raw_event_size
    if trailing_bytes != 0:
        print(f"===Warning===cpu rank:{rank} channel:{channel} ignore truncated trailing record of {trailing_bytes} bytes")

    for raw_content_idx in range(0, len(raw_content) - trailing_bytes, raw_event_size):
        parsed_cpu_event = parse_cpu_event(raw_content[raw_content_idx : raw_content_idx + raw_event_size])
        event_type = npkit_event_def['id_to_type'][parsed_cpu_event['id']]
        phase = 'B' if event_type.endswith('_ENTRY') else 'E'
        slot = parsed_cpu_event['slot']

        if phase == 'E' and slot not in slot_to_fiber_id:
            # Nothing was opened for this slot, so there is neither a lane nor
            # a start time to attribute the event to.
            unmatched_exit_num += 1
            continue

        event = {
            'ph': phase,
            'ts': parsed_cpu_event['timestamp'] / 1000, # time unit is usec
            'pid': rank
        }

        if phase == 'B':
            # Open fiber event: reuse the lowest free lane or add a new one.
            try:
                fiber_id = fiber_is_usable.index(True)
            except ValueError:
                fiber_id = len(fiber_is_usable)
                fiber_is_usable.append(True)
                fiber_open_ts.append(0.0)
            slot_to_fiber_id[slot] = fiber_id
            fiber_open_ts[fiber_id] = event['ts']
            fiber_is_usable[fiber_id] = False

            seq = event_type_to_seq.get(event_type, 0)
            event.update({
                'name': event_type,
                'cat': 'CPU',
                'args': {
                    'rank': rank,
                    'channel': channel,
                    'slot': parsed_cpu_event['slot'],
                    'seq': seq,
                    'size_0': parsed_cpu_event['size']
                }
            })
            event_type_to_seq[event_type] = seq + 1
        else:
            # Close fiber event: release the lane for reuse.
            fiber_id = slot_to_fiber_id.pop(slot)
            last_ts = fiber_open_ts[fiber_id]
            fiber_is_usable[fiber_id] = True

            # Clamped to 0.001 so the divisions below can never hit zero.
            delta_time = max(0.001, event['ts'] - last_ts)
            event['args'] = {'size': parsed_cpu_event['size']}
            event['args']['bw (GB/s)'] = parsed_cpu_event['size'] / delta_time / 1e3

            cur_id = parsed_cpu_event['id']
            cpu_count[rank][cur_id] += 1
            cpu_sum_tm[rank][cur_id] += delta_time
            cpu_sum_size[rank][cur_id] += parsed_cpu_event['size']
            # The first size seen is the reference; later deviations are reported.
            if cpu_io_size[rank][cur_id] == 0:
                cpu_io_size[rank][cur_id] = parsed_cpu_event['size']
            elif parsed_cpu_event['size'] != cpu_io_size[rank][cur_id]:
                print(f"===Warning===cpu rank:{rank} id:{cur_id} invaid io szie:{parsed_cpu_event['size']} recode io size:{cpu_io_size[rank][cur_id]}")

        event['tid'] = fiber_id + (channel + 1) * channel_shift
        cpu_events.append(event)

    if unmatched_exit_num != 0:
        print(f"===Warning===cpu rank:{rank} channel:{channel} {unmatched_exit_num} exit event(s) without matching entry event were dropped")
    return cpu_events

def convert_npkit_dump_to_trace(npkit_dump_dir, output_dir, npkit_event_def):
    """Parse every dump file in a directory, print the report and optionally dump JSON.

    Ranks and buffer indices are taken from the ``gpu_events_rank_*`` file
    names, channels from the ``cpu_events_rank_*`` file names. Every
    (rank, buf) and (rank, channel) combination is parsed, so all ranks are
    expected to provide the same set of buffers and channels. The report uses
    event names from ``npkit_event_display.h`` located next to this script.
    When ``need_dump_json == 1`` the collected events are sorted by timestamp
    and written to ``<output_dir>/npkit_event_trace.json``.

    Args:
        npkit_dump_dir: directory containing the NPKit dump files.
        output_dir: directory for the JSON trace (created if missing).
        npkit_event_def: event definition used to decode the dump records.

    Raises:
        OSError: if the dump directory or one of the expected files cannot be
            read (e.g. ``FileNotFoundError``, ``NotADirectoryError``).
    """
    # os.listdir reports a missing or unreadable directory with a descriptive
    # OSError; next(os.walk(...)) surfaced it as a bare StopIteration.
    files_in_dump_dir = [
        name for name in os.listdir(npkit_dump_dir)
        if not os.path.isdir(os.path.join(npkit_dump_dir, name))
    ]
    gpu_event_files = [x for x in files_in_dump_dir if x.startswith('gpu_events_rank_')]
    cpu_event_files = [x for x in files_in_dump_dir if x.startswith('cpu_events_rank_')]

    # Sorted so the processing and print order is deterministic.
    ranks = sorted({int(x.split('_rank_')[1].split('_')[0]) for x in gpu_event_files})
    buf_indices = sorted({int(x.split('_buf_')[1].split('_')[0]) for x in gpu_event_files})
    channels = sorted({int(x.split('_channel_')[1].split('_')[0]) for x in cpu_event_files})
    print(f"mode:{rccl_normal_test} rank_num:{len(ranks)} buf_num:{len(buf_indices)} OUT_START1:{OUT_START1} OUT_END1:{OUT_END1} IN_START1:{IN_START1} IN_END1:{IN_END1} ALL_NUM:{ALL_NUM}")
    trace = {'traceEvents': []}

    for rank in ranks:
        cpu_clock_den_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_den_rank_%d' % rank)
        cpu_clock_num_file_path = os.path.join(npkit_dump_dir, 'cpu_clock_period_num_rank_%d' % rank)
        cpu_clock_scale = parse_cpu_clock_scale(cpu_clock_den_file_path, cpu_clock_num_file_path)

        gpu_clock_file_path = os.path.join(npkit_dump_dir, 'gpu_clock_rate_rank_%d' % rank)
        gpu_clock_scale_mhz = parse_gpu_clock_scale(gpu_clock_file_path)
        if gpu_clock_scale_mhz == 0:
            # A zero rate would make every timestamp conversion divide by zero.
            print(f"===Warning===gpu rank:{rank} gpu clock from file is zero, use default clock rate:25 mhz")
            gpu_clock_scale_mhz = 25
        print(f"rank:{rank} gpu_clock_scale:{gpu_clock_scale_mhz}")

        for buf_idx in buf_indices:
            gpu_events = parse_gpu_event_file(npkit_dump_dir, npkit_event_def, rank, buf_idx, gpu_clock_scale_mhz, cpu_clock_scale)
            trace['traceEvents'].extend(gpu_events)

        for channel in channels:
            cpu_events = parse_cpu_event_file(npkit_dump_dir, npkit_event_def, rank, channel, cpu_clock_scale)
            trace['traceEvents'].extend(cpu_events)

    # The report takes its event names from a separate header next to this script.
    current_path = os.path.dirname(os.path.abspath(__file__))
    display_file_path = os.path.join(current_path, 'npkit_event_display.h')
    display_event_def = parse_npkit_event_header(display_file_path)
    show_result(display_event_def, len(ranks), len(buf_indices))

    if need_dump_json == 1:
        # Sorting is only needed for the JSON output, so it is skipped otherwise.
        trace['traceEvents'].sort(key=lambda x : x['ts'])
        trace['displayTimeUnit'] = 'ns'
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'npkit_event_trace.json'), 'w') as f:
            json.dump(trace, f)

if __name__ == '__main__':
    # Command-line entry point: parse the options, apply the optional mode
    # overrides to the module-level flags, then convert the dump directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--npkit_dump_dir', type=str, required=True, help='NPKit dump directory.')
    parser.add_argument('--npkit_event_header_path', type=str, required=True, help='Path to npkit_event.h.')
    parser.add_argument('--output_dir', type=str, required=False, help='Path to output directory.')
    parser.add_argument('--rccl_normal_test', type=int, required=False, help='Rccl normal test mode.')
    # Without these two switches the JSON trace and the per-buffer tables
    # could only be enabled by editing the defaults in this file.
    parser.add_argument('--need_dump_json', type=int, choices=[0, 1], required=False,
                        help='1: write npkit_event_trace.json to the output directory, 0: do not.')
    parser.add_argument('--show_rank_buf', type=int, choices=[0, 1], required=False,
                        help='1: print the statistics table of every rank/buffer, 0: do not.')
    args = parser.parse_args()
    if args.output_dir is None:
        # Default to writing next to the dump files.
        args.output_dir = args.npkit_dump_dir
        print(f"output_dir:{args.output_dir}")
    # An option that was not given keeps the module-level default.
    if args.rccl_normal_test is not None:
        rccl_normal_test = args.rccl_normal_test
    if args.need_dump_json is not None:
        need_dump_json = args.need_dump_json
    if args.show_rank_buf is not None:
        show_rank_buf = args.show_rank_buf

    npkit_event_def = parse_npkit_event_header(args.npkit_event_header_path)
    convert_npkit_dump_to_trace(args.npkit_dump_dir, args.output_dir, npkit_event_def)