"csrc/gfx93/decode/dense/splitkv_mla.h" did not exist on "c28eca99dbc664dd2716415ed03492afe5fefade"
runtime.cu 3.08 KB
Newer Older
lijian6's avatar
lijian6 committed
1
#include "hip/hip_runtime.h"
Chenggang Zhao's avatar
Chenggang Zhao committed
2
3
4
5
6
7
#include <cstring>

#include "configs.cuh"
#include "exception.cuh"
#include "launch.cuh"
#include "utils.cuh"
8

lijian6's avatar
lijian6 committed
9
10
#ifndef DISABLE_ROCSHMEM
#include <rocshmem/rocshmem.hpp>
11
#endif
Chenggang Zhao's avatar
Chenggang Zhao committed
12
13
14
15
namespace deep_ep {

namespace intranode {

lijian6's avatar
lijian6 committed
16
17
template <int kNumRanks> 
__global__ void barrier(int **barrier_signal_ptrs, int rank) {
18
    barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
Chenggang Zhao's avatar
Chenggang Zhao committed
19
20
}

lijian6's avatar
lijian6 committed
21
22
23
void barrier(int **barrier_signal_ptrs, int rank, int num_ranks, hipStream_t stream) {
#define BARRIER_LAUNCH_CASE(ranks)                                                                 \
    LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank);                                \
Chenggang Zhao's avatar
Chenggang Zhao committed
24
25
    break

lijian6's avatar
lijian6 committed
26
    SETUP_LAUNCH_CONFIG(1, kWarpSize, stream);
Chenggang Zhao's avatar
Chenggang Zhao committed
27
28
29
30
31
32
33
34
    SWITCH_RANKS(BARRIER_LAUNCH_CASE);
#undef BARRIER_LAUNCH_CASE
}

} // namespace intranode

namespace internode {

lijian6's avatar
lijian6 committed
35
36
37
#ifndef DISABLE_ROCSHMEM
rocshmem::rocshmem_team_t        cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
rocshmem::rocshmem_team_config_t cpu_rdma_team_config;
Chenggang Zhao's avatar
Chenggang Zhao committed
38
39

std::vector<uint8_t> get_unique_id() {
lijian6's avatar
lijian6 committed
40
41
42
43
    rocshmem::rocshmem_uniqueid_t unique_id;
    rocshmem::rocshmem_get_uniqueid(&unique_id);
    std::vector<uint8_t> result(sizeof(rocshmem::rocshmem_uniqueid_t));
    std::memcpy(result.data(), &unique_id, sizeof(rocshmem::rocshmem_uniqueid_t));
Chenggang Zhao's avatar
Chenggang Zhao committed
44
45
46
    return result;
}

lijian6's avatar
lijian6 committed
47
48
49
50
51
52
53
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks,
         bool low_latency_mode) {
    rocshmem::rocshmem_uniqueid_t  root_unique_id;
    rocshmem::rocshmem_init_attr_t attr;
    std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(rocshmem::rocshmem_uniqueid_t));
    rocshmem::rocshmem_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
    rocshmem::rocshmem_init_attr(rocshmem::ROCSHMEM_INIT_WITH_UNIQUEID, &attr);
Chenggang Zhao's avatar
Chenggang Zhao committed
54
55
56
57

    // Create sub-RDMA teams
    // NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
    if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
lijian6's avatar
lijian6 committed
58
        EP_HOST_ASSERT(cpu_rdma_team == rocshmem::ROCSHMEM_TEAM_INVALID);
Chenggang Zhao's avatar
Chenggang Zhao committed
59
        EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
lijian6's avatar
lijian6 committed
60
61
62
63
64
        EP_HOST_ASSERT(rocshmem::rocshmem_team_split_strided(
                               rocshmem::ROCSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS,
                               NUM_MAX_NVL_PEERS, num_ranks / NUM_MAX_NVL_PEERS,
                               &cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
        EP_HOST_ASSERT(cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID);
Chenggang Zhao's avatar
Chenggang Zhao committed
65
66
    }

lijian6's avatar
lijian6 committed
67
68
    rocshmem::rocshmem_barrier_all();
    return rocshmem::rocshmem_my_pe();
Chenggang Zhao's avatar
Chenggang Zhao committed
69
70
}

lijian6's avatar
lijian6 committed
71
72
73
void *alloc(size_t size, size_t alignment) {
    auto alloc_size = ALIGN(size, alignment);
    return rocshmem::rocshmem_malloc(alloc_size);
Chenggang Zhao's avatar
Chenggang Zhao committed
74
75
}

lijian6's avatar
lijian6 committed
76
77
void free(void *ptr) {
    rocshmem::rocshmem_free(ptr);
Chenggang Zhao's avatar
Chenggang Zhao committed
78
79
80
}

void barrier() {
lijian6's avatar
lijian6 committed
81
    rocshmem::rocshmem_barrier_all();
Chenggang Zhao's avatar
Chenggang Zhao committed
82
83
84
}

void finalize() {
lijian6's avatar
lijian6 committed
85
86
87
    if (cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID) {
        rocshmem::rocshmem_team_destroy(cpu_rdma_team);
        cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
Chenggang Zhao's avatar
Chenggang Zhao committed
88
    }
lijian6's avatar
lijian6 committed
89
    rocshmem::rocshmem_finalize();
Chenggang Zhao's avatar
Chenggang Zhao committed
90
}
91
#endif
Chenggang Zhao's avatar
Chenggang Zhao committed
92
93
94
95

} // namespace internode

} // namespace deep_ep