"csrc/moe/topk_softmax_kernels.cu" did not exist on "2ca8867f0322aac5927d6b6741619ec36349c7ac"
p2p_latency_test.cpp 4.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*************************************************************************
 * Copyright (c) Microsoft Corporation.
 * Licensed under the MIT License.
 ************************************************************************/

#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <fstream>
#include <string>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <iostream> //cerr
#include <cstring>

#define HIP_IPC_MEM_MIN_SIZE 2097152UL

#define NUM_LOOPS_WARMUP 2000
#define NUM_LOOPS_RUN 10000

#define PING_MODE 0
#define PONG_MODE 1

__global__ void PingKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) {
  #pragma unroll
  for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
    __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
    while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
  }
  uint64_t start_time = wall_clock64();
  #pragma unroll
  for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
    __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
    while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
  }
  uint64_t end_time = wall_clock64();
  *time_delta = end_time - start_time;
}

__global__ void PongKernel(uint64_t* local_flag, uint64_t* remote_flag, uint64_t* time_delta) {
  #pragma unroll
  for (uint32_t i = 1; i < NUM_LOOPS_WARMUP; i++) {
    while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
    __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
  }
  uint64_t start_time = wall_clock64();
  #pragma unroll
  for (uint32_t i = NUM_LOOPS_WARMUP; i <= NUM_LOOPS_WARMUP + NUM_LOOPS_RUN; i++) {
    while (__atomic_load_n(local_flag, __ATOMIC_RELAXED) != i);
    __atomic_store_n(remote_flag, i, __ATOMIC_RELAXED);
  }
  uint64_t end_time = wall_clock64();
  *time_delta = end_time - start_time;
}

#define HIPCHECK(cmd)                                                          \
do {                                                                           \
  hipError_t error = (cmd);                                                    \
  if (error != hipSuccess)                                                     \
  {                                                                            \
    std::cerr << "Encountered HIP error (" << error << ") at line "            \
              << __LINE__ << " in file " << __FILE__ << "\n";                  \
    exit(-1);                                                                  \
  }                                                                            \
} while (0)

int main(int argc, char** argv) {
  hipStream_t stream[2];
  hipError_t err = hipSuccess;
  int device_id[2];
  hipDeviceProp_t prop[2];

  if (argc != 3) {
    fprintf(stderr, "Usage: ./p2p_latency_test ping_dev_id pong_dev_id\n");
    return -1;
  }
  device_id[0] = atoi(argv[1]);
  device_id[1] = atoi(argv[2]);

  fprintf(stdout, "Using devices %d %d\n", device_id[0], device_id[1]);

  uint64_t *flag[2];
  uint64_t *time_delta[2];

  HIPCHECK(hipSetDevice(device_id[0]));
  HIPCHECK(hipStreamCreateWithFlags(&stream[0], hipStreamNonBlocking));
  HIPCHECK(hipDeviceEnablePeerAccess(device_id[1], 0));
  HIPCHECK(hipGetDeviceProperties(&prop[0], device_id[0]));
  HIPCHECK(hipExtMallocWithFlags((void**)&flag[0], HIP_IPC_MEM_MIN_SIZE, prop[0].gcnArch / 10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
  HIPCHECK(hipMalloc((void**)&time_delta[0], HIP_IPC_MEM_MIN_SIZE));
  HIPCHECK(hipMemsetAsync(flag[0], 0, HIP_IPC_MEM_MIN_SIZE, stream[0]));
  HIPCHECK(hipStreamSynchronize(stream[0]));

  HIPCHECK(hipSetDevice(device_id[1]));
  HIPCHECK(hipStreamCreateWithFlags(&stream[1], hipStreamNonBlocking));
  HIPCHECK(hipDeviceEnablePeerAccess(device_id[0], 0));
  HIPCHECK(hipGetDeviceProperties(&prop[1], device_id[1]));
  HIPCHECK(hipExtMallocWithFlags((void**)&flag[1], HIP_IPC_MEM_MIN_SIZE, prop[1].gcnArch / 10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
  HIPCHECK(hipMalloc((void**)&time_delta[1], HIP_IPC_MEM_MIN_SIZE));
  HIPCHECK(hipMemsetAsync(flag[1], 0, HIP_IPC_MEM_MIN_SIZE, stream[1]));
  HIPCHECK(hipStreamSynchronize(stream[1]));

  HIPCHECK(hipSetDevice(device_id[0]));
  PingKernel<<<1, 1, 0, stream[0]>>>(flag[0], flag[1], time_delta[0]);

  HIPCHECK(hipSetDevice(device_id[1]));
  PongKernel<<<1, 1, 0, stream[1]>>>(flag[1], flag[0], time_delta[1]);

  double vega_gpu_rtc_freq;

  HIPCHECK(hipStreamSynchronize(stream[0]));
  vega_gpu_rtc_freq = (prop[0].gcnArch / 10 == 94) ? 1.0E8 : 2.5E7;
  fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[0]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);

  HIPCHECK(hipStreamSynchronize(stream[1]));
  vega_gpu_rtc_freq = (prop[1].gcnArch / 10 == 94) ? 1.0E8 : 2.5E7;
  fprintf(stdout, "One-way latency in us: %g\n", double(*time_delta[1]) * 1e6 / NUM_LOOPS_RUN / vega_gpu_rtc_freq / 2);

  HIPCHECK(hipFree(flag[0]));
  HIPCHECK(hipFree(time_delta[0]));
  HIPCHECK(hipFree(flag[1]));
  HIPCHECK(hipFree(time_delta[1]));
  return 0;
}