#ifndef _TIMELINE_H_
#define _TIMELINE_H_

// NOTE(review): the header-name tokens after the eight bare `#include`
// directives in the original were lost (angle-bracket forms stripped,
// likely by tooling). The standard headers below cover every std:: /
// fixed-width name this file visibly uses — confirm against the original.
#include <cstddef>  // size_t
#include <cstdint>  // uint8_t, uint16_t, uint64_t
#include <mutex>    // std::mutex
#include <thread>   // std::thread

#include "nccl.h"
#include "timeline/timeline_struct.h"
#include "timeline/timeline_event.h"

// Per-communicator profiling timeline. Collects GPU-side events from inside
// kernels (via the static __device__ helpers) and CPU-side events from the
// host, and can dump the combined trace. A background thread keeps a shared
// CPU timestamp updated so device code can read wall-clock time cheaply.
class Timeline {
 public:
  Timeline();

  // Set once the collected events have been written out by Dump().
  bool isDumped = false;

  ncclResult_t Init(ncclComm_t comm);
  ncclResult_t Shutdown();
  TimelineCpuEventContext* GetCpuEventContext();
  TimelineGpuEventContext* GetGpuEventContext();
  static ncclResult_t Dump(ncclComm_t comm);
  // Pointer to the host-updated timestamp cell (see CpuTimestampUpdateThread).
  static uint64_t* GetCpuTimestamp();

  // Record one GPU event for the calling (block, thread) pair.
  //
  // ctx        - device-visible profiling context; nullptr disables profiling.
  // type       - event type (e.g. TIMELINE_EVENT_COLL_EXIT / _P2P_EXIT).
  // funcIndex  - kernel/function identifier stored with the event.
  // timestamp  - host wall-clock cell, read and stored alongside the GPU
  //              hardware timestamp (__builtin_amdgcn_s_memrealtime, AMD/HIP).
  //
  // The first ctx->skip operations are dropped as warm-up: during that phase
  // only (block 0, thread 0) advances the skipped/op counters, and only on an
  // exit-type event, so each op is counted exactly once.
  static inline __device__ void CollectGpuEvent(TimelineGpuEventContext* ctx,
                                                uint8_t type,
                                                uint16_t funcIndex,
                                                uint64_t* timestamp) {
    if (ctx == nullptr) return;
    if (ctx->isFull) return;
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    if (tid >= ctx->gpuMaxProfilingThreads) return;
    // Warm-up skipping: drop events until ctx->skip ops have completed.
    if (ctx->skipped < ctx->skip) {
      if (type == TIMELINE_EVENT_COLL_EXIT || type == TIMELINE_EVENT_P2P_EXIT) {
        if (bid == 0 && tid == 0) {
          ctx->skipped++;
          ctx->curOpCount++;
        }
      }
      return;
    }
    // Each (block, thread) pair owns one handle with its own event buffer.
    TimelineGpuEventHandle* handle =
        ctx->handle + (bid * ctx->gpuMaxProfilingThreads + tid);
    uint64_t event_buffer_head = handle->event_buffer_head;
    if (event_buffer_head < ctx->gpuMaxProfilingEvents) {
      TimelineGpuEvent& event = handle->event_buffer[event_buffer_head];
      event.type = type;
      event.opCount = ctx->curOpCount;
      event.funcIndex = funcIndex;
      event.timestamp = *timestamp;
      event.gpuTimestamp = __builtin_amdgcn_s_memrealtime();
      handle->event_buffer_head++;
    }
    // On an op's exit, (block 0, thread 0) advances the op counter and marks
    // the context full once its buffer has no room left.
    // Fix: the original formed a reference to
    // handle->event_buffer[event_buffer_head] here even when
    // event_buffer_head was already past the end of the buffer
    // (out-of-bounds); the reference was only used by a commented-out
    // printf, so it has been removed.
    if (bid == 0 && tid == 0 &&
        (type == TIMELINE_EVENT_COLL_EXIT || type == TIMELINE_EVENT_P2P_EXIT)) {
      if (event_buffer_head >= ctx->gpuMaxProfilingEvents) ctx->isFull = 1;
      ctx->curOpCount++;
    }
  }

  // Record one GPU primitive-level event (size + caller-supplied GPU
  // timestamp) for the calling (block, thread) pair. Events are dropped
  // entirely while the warm-up skip phase is active; counters are advanced
  // only by CollectGpuEvent.
  static inline __device__ void CollectGpuPrimEvent(
      TimelineGpuEventContext* ctx, uint8_t type, size_t size,
      uint64_t gpuTimestamp, uint64_t* timestamp) {
    if (ctx == nullptr) return;
    if (ctx->isFull) return;
    if (ctx->skipped < ctx->skip) return;
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    if (tid >= ctx->gpuMaxProfilingThreads) return;
    TimelineGpuEventHandle* handle =
        ctx->handle + (bid * ctx->gpuMaxProfilingThreads + tid);
    uint64_t event_buffer_head = handle->event_buffer_head;
    if (event_buffer_head < ctx->gpuMaxProfilingEvents) {
      TimelineGpuEvent& event = handle->event_buffer[event_buffer_head];
      event.type = type;
      event.size = size;
      event.gpuTimestamp = gpuTimestamp;
      event.timestamp = *timestamp;
      handle->event_buffer_head++;
    }
  }

  // Record one host-side event spanning [beginTimestamp, endTimestamp].
  static ncclResult_t CollectCpuEvent(uint64_t beginTimestamp,
                                      uint64_t endTimestamp,
                                      struct ncclInfo* info,
                                      struct ncclKernelPlan* plan);

 private:
  uint64_t rank_ = -1;           // communicator rank; -1 until Init()
  uint64_t nChannel_ = 0;        // number of channels in the communicator
  ncclComm_t comm_ = nullptr;
  long hostIsExport;             // NOTE(review): meaning not visible here
  TimelineGpuEventContext* gpu_event_context_ = nullptr;
  TimelineCpuEventContext* cpu_event_context_ = nullptr;

  // Shared across all Timeline instances in the process.
  static uint obj_num;
  static std::mutex mtx;

  std::thread* dump_event_thread_ = nullptr;
  // volatile bool dump_event_thread_should_stop_;
  // static void DumpTimelineEventThread(ncclComm_t comm);

  // Host thread that keeps *cpu_timestamp_ updated for device readers.
  static uint64_t* cpu_timestamp_;
  static std::thread* cpu_timestamp_update_thread_;
  static volatile bool cpu_timestamp_update_thread_should_stop_;
  static void CpuTimestampUpdateThread();
};

#endif  // _TIMELINE_H_