npkit.h 2.92 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/*************************************************************************
 * Copyright (c) Microsoft Corporation.
 * Licensed under the MIT License.
 ************************************************************************/

#ifndef NPKIT_H_
#define NPKIT_H_

#include <string>
#include <thread>

#include <hip/hip_runtime.h>

#include "npkit/npkit_event.h"
#include "npkit/npkit_struct.h"
#include "common.h"

#include <time.h>
#include <sys/time.h>

#define NPKIT_GET_GPU_TIMESTAMP wall_clock64
#define RANK_NUM     16
#define CHANNEL_NUM  32
#define HOST_SUBMIT_CHANNEL_BUF  32

class NpKit {
 public:
  static const uint64_t kNumGpuEventBuffers = CHANNEL_NUM;

  static const uint64_t kNumCpuEventBuffers = CHANNEL_NUM + 1;

  static ncclResult_t Init(int rank);

  static ncclResult_t Dump(const std::string& dump_dir, int rank);

  static ncclResult_t Shutdown(int rank);

  static NpKitEventCollectContext* GetGpuEventCollectContexts(int rank);

  static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp,
                                                NpKitEventCollectContext* ctx) {
    uint64_t event_buffer_head = ctx->event_buffer_head;
    if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {
      NpKitEvent& event = ctx->event_buffer[event_buffer_head];
      event.fields.type = type;
      event.fields.size = size < 0 ? 0 : size;
      event.fields.rsvd = rsvd;
      event.fields.timestamp = timestamp;
      ctx->event_buffer_head++;
    }
  }

  static inline __device__ void CollectGpuEventLDS(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp) {
#if defined(ENABLE_NPKIT)
    if (ncclShmem.event_buffer_head < LDS_NUM_EVENTS) {
      NpKitEvent& event = ncclShmem.event_buffer[ncclShmem.event_buffer_head];
      event.fields.type = type;
      event.fields.size = size < 0 ? 0 : size;
      event.fields.rsvd = rsvd;
      event.fields.timestamp = timestamp;
      ncclShmem.event_buffer_head++;
    }
#endif
  }

  static void CollectCpuEvent(int rank, uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);

  static uint64_t *GetCpuTimestamp();
  static uint64_t GetCpuTimeNs();

 private:
  static void CpuTimestampUpdateThread();

  // 1M * 32 * 16B = 512MB per GPU
  static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 20;

  // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
  static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;

  static NpKitEvent** gpu_event_buffers_[RANK_NUM];
  static NpKitEvent** cpu_event_buffers_[RANK_NUM];
  static int gpu_rtc_rate_khz[RANK_NUM];

  static NpKitEventCollectContext* gpu_collect_contexts_[RANK_NUM];
  static NpKitEventCollectContext* cpu_collect_contexts_[RANK_NUM];
  static uint64_t* cpu_timestamp_;
  static pthread_mutex_t npKitLock;

  static uint64_t rank_;

  static std::thread* cpu_timestamp_update_thread_;
  static volatile bool cpu_timestamp_update_thread_should_stop_;
};

#endif