#ifndef _TIMELINE_H_
#define _TIMELINE_H_

// NOTE(review): the header-name tokens after the eight bare `#include`
// directives in the original were lost (angle-bracket forms stripped,
// likely by tooling). The standard headers below cover every std:: /
// fixed-width name this file visibly uses — confirm against the original.
#include <cstddef>  // size_t
#include <cstdint>  // uint8_t, uint16_t, uint64_t
#include <mutex>    // std::mutex
#include <thread>   // std::thread

#include "nccl.h"
#include "timeline/timeline_struct.h"
#include "timeline/timeline_event.h"

// Per-communicator profiling timeline. Collects GPU-side events from inside
// kernels (via the static __device__ helpers) and CPU-side events from the
// host, and can dump the combined trace. A background thread keeps a shared
// CPU timestamp updated so device code can read wall-clock time cheaply.
class Timeline {
 public:
  Timeline();

  // Set once the collected events have been written out by Dump().
  bool isDumped = false;

  ncclResult_t Init(ncclComm_t comm);
  ncclResult_t Shutdown();
  TimelineCpuEventContext* GetCpuEventContext();
  TimelineGpuEventContext* GetGpuEventContext();
  static ncclResult_t Dump(ncclComm_t comm);
  // Pointer to the host-updated timestamp cell (see CpuTimestampUpdateThread).
  static uint64_t* GetCpuTimestamp();

  // Record one GPU event for the calling (block, thread) pair.
  //
  // ctx        - device-visible profiling context; nullptr disables profiling.
  // type       - event type (e.g. TIMELINE_EVENT_COLL_EXIT / _P2P_EXIT).
  // funcIndex  - kernel/function identifier stored with the event.
  // timestamp  - host wall-clock cell, read and stored alongside the GPU
  //              hardware timestamp (__builtin_amdgcn_s_memrealtime, AMD/HIP).
  //
  // The first ctx->skip operations are dropped as warm-up: during that phase
  // only (block 0, thread 0) advances the skipped/op counters, and only on an
  // exit-type event, so each op is counted exactly once.
  static inline __device__ void CollectGpuEvent(TimelineGpuEventContext* ctx,
                                                uint8_t type,
                                                uint16_t funcIndex,
                                                uint64_t* timestamp) {
    if (ctx == nullptr) return;
    if (ctx->isFull) return;
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    if (tid >= ctx->gpuMaxProfilingThreads) return;
    // Warm-up skipping: drop events until ctx->skip ops have completed.
    if (ctx->skipped < ctx->skip) {
      if (type == TIMELINE_EVENT_COLL_EXIT || type == TIMELINE_EVENT_P2P_EXIT) {
        if (bid == 0 && tid == 0) {
          ctx->skipped++;
          ctx->curOpCount++;
        }
      }
      return;
    }
    // Each (block, thread) pair owns one handle with its own event buffer.
    TimelineGpuEventHandle* handle =
        ctx->handle + (bid * ctx->gpuMaxProfilingThreads + tid);
    uint64_t event_buffer_head = handle->event_buffer_head;
    if (event_buffer_head < ctx->gpuMaxProfilingEvents) {
      TimelineGpuEvent& event = handle->event_buffer[event_buffer_head];
      event.type = type;
      event.opCount = ctx->curOpCount;
      event.funcIndex = funcIndex;
      event.timestamp = *timestamp;
      event.gpuTimestamp = __builtin_amdgcn_s_memrealtime();
      handle->event_buffer_head++;
    }
    // On an op's exit, (block 0, thread 0) advances the op counter and marks
    // the context full once its buffer has no room left.
    // Fix: the original formed a reference to
    // handle->event_buffer[event_buffer_head] here even when
    // event_buffer_head was already past the end of the buffer
    // (out-of-bounds); the reference was only used by a commented-out
    // printf, so it has been removed.
    if (bid == 0 && tid == 0 &&
        (type == TIMELINE_EVENT_COLL_EXIT || type == TIMELINE_EVENT_P2P_EXIT)) {
      if (event_buffer_head >= ctx->gpuMaxProfilingEvents) ctx->isFull = 1;
      ctx->curOpCount++;
    }
  }

  // Record one GPU primitive-level event (size + caller-supplied GPU
  // timestamp) for the calling (block, thread) pair. Events are dropped
  // entirely while the warm-up skip phase is active; counters are advanced
  // only by CollectGpuEvent.
  static inline __device__ void CollectGpuPrimEvent(
      TimelineGpuEventContext* ctx, uint8_t type, size_t size,
      uint64_t gpuTimestamp, uint64_t* timestamp) {
    if (ctx == nullptr) return;
    if (ctx->isFull) return;
    if (ctx->skipped < ctx->skip) return;
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    if (tid >= ctx->gpuMaxProfilingThreads) return;
    TimelineGpuEventHandle* handle =
        ctx->handle + (bid * ctx->gpuMaxProfilingThreads + tid);
    uint64_t event_buffer_head = handle->event_buffer_head;
    if (event_buffer_head < ctx->gpuMaxProfilingEvents) {
      TimelineGpuEvent& event = handle->event_buffer[event_buffer_head];
      event.type = type;
      event.size = size;
      event.gpuTimestamp = gpuTimestamp;
      event.timestamp = *timestamp;
      handle->event_buffer_head++;
    }
  }

  // Record one host-side event spanning [beginTimestamp, endTimestamp].
  static ncclResult_t CollectCpuEvent(uint64_t beginTimestamp,
                                      uint64_t endTimestamp,
                                      struct ncclInfo* info,
                                      struct ncclKernelPlan* plan);

 private:
  uint64_t rank_ = -1;           // communicator rank; -1 until Init()
  uint64_t nChannel_ = 0;        // number of channels in the communicator
  ncclComm_t comm_ = nullptr;
  long hostIsExport;             // NOTE(review): meaning not visible here
  TimelineGpuEventContext* gpu_event_context_ = nullptr;
  TimelineCpuEventContext* cpu_event_context_ = nullptr;

  // Shared across all Timeline instances in the process.
  static uint obj_num;
  static std::mutex mtx;

  std::thread* dump_event_thread_ = nullptr;
  // volatile bool dump_event_thread_should_stop_;
  // static void DumpTimelineEventThread(ncclComm_t comm);

  // Host thread that keeps *cpu_timestamp_ updated for device readers.
  static uint64_t* cpu_timestamp_;
  static std::thread* cpu_timestamp_update_thread_;
  static volatile bool cpu_timestamp_update_thread_should_stop_;
  static void CpuTimestampUpdateThread();
};

#endif  // _TIMELINE_H_