/************************************************************************* * Copyright (c) Microsoft Corporation. * Licensed under the MIT License. ************************************************************************/ #ifndef NPKIT_H_ #define NPKIT_H_ #include #include #include #include "npkit/npkit_event.h" #include "npkit/npkit_struct.h" #include "common.h" #include #include #define NPKIT_GET_GPU_TIMESTAMP wall_clock64 #define RANK_NUM 16 #define CHANNEL_NUM 32 #define HOST_SUBMIT_CHANNEL_BUF 32 class NpKit { public: static const uint64_t kNumGpuEventBuffers = CHANNEL_NUM; static const uint64_t kNumCpuEventBuffers = CHANNEL_NUM + 1; static ncclResult_t Init(int rank); static ncclResult_t Dump(const std::string& dump_dir, int rank); static ncclResult_t Shutdown(int rank); static NpKitEventCollectContext* GetGpuEventCollectContexts(int rank); static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, NpKitEventCollectContext* ctx) { uint64_t event_buffer_head = ctx->event_buffer_head; if (event_buffer_head < kMaxNumGpuEventsPerBuffer) { NpKitEvent& event = ctx->event_buffer[event_buffer_head]; event.fields.type = type; event.fields.size = size < 0 ? 0 : size; event.fields.rsvd = rsvd; event.fields.timestamp = timestamp; ctx->event_buffer_head++; } } static inline __device__ void CollectGpuEventLDS(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp) { #if defined(ENABLE_NPKIT) if (ncclShmem.event_buffer_head < LDS_NUM_EVENTS) { NpKitEvent& event = ncclShmem.event_buffer[ncclShmem.event_buffer_head]; event.fields.type = type; event.fields.size = size < 0 ? 0 : size; event.fields.rsvd = rsvd; event.fields.timestamp = timestamp; ncclShmem.event_buffer_head++; } #endif } static void CollectCpuEvent(int rank, uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id); static uint64_t *GetCpuTimestamp(); static uint64_t GetCpuTimeNs(); private: static void CpuTimestampUpdateThread(); // 1M * 32 * 16B = 512MB per GPU static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 20; // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21; static NpKitEvent** gpu_event_buffers_[RANK_NUM]; static NpKitEvent** cpu_event_buffers_[RANK_NUM]; static int gpu_rtc_rate_khz[RANK_NUM]; static NpKitEventCollectContext* gpu_collect_contexts_[RANK_NUM]; static NpKitEventCollectContext* cpu_collect_contexts_[RANK_NUM]; static uint64_t* cpu_timestamp_; static pthread_mutex_t npKitLock; static uint64_t rank_; static std::thread* cpu_timestamp_update_thread_; static volatile bool cpu_timestamp_update_thread_should_stop_; }; #endif