rocm_wrap.cpp 6.06 KB
Newer Older
lishen's avatar
lishen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <dlfcn.h>
#include <pthread.h>
#include <sys/utsname.h>
#include <fstream>

#include "base.h"
#include "rocm_wrap.h"

namespace sccl {
namespace hardware {
namespace net {
namespace rocm_wrap {
// Defines the namespace-scope function pointer pfn_<symbol>, typed PFN_<symbol>, initialized to null.
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr

DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
DECLARE_ROCM_PFN(hsa_status_string);

// Registers the "DMABUF_ENABLE" parameter with 0 as its third argument. The macro is defined
// outside this file; presumably it generates the scclParamDmaBufEnable() accessor called by
// initOnceFunc and treats 0 as the default — TODO confirm, including the full env-variable name.
SCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);

// pthread_once control that rocmLibraryInit() passes together with initOnceFunc.
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
// Result written by initOnceFunc and returned by rocmLibraryInit().
static scclResult_t initResult;

// Handle returned by dlopen() for the ROCr runtime library.
static void* hsaLib;
// Version values filled in through hsa_system_get_info() during initialization.
static uint16_t version_major, version_minor;
// Set from the CUDA_LAUNCH_BLOCKING environment variable in initOnceFunc.
bool scclCudaLaunchBlocking = false;

//////////////////////////////////////////////////////////////////////////////
static void initOnceFunc() {
    do {
        char* val              = getenv("CUDA_LAUNCH_BLOCKING");
        scclCudaLaunchBlocking = val != nullptr && val[0] != 0 && !(val[0] == '0' && val[1] == 0);
    } while(0);

    bool dmaBufSupport = false;
    hsa_status_t res;

    /*
     * Load ROCr driver library
     */
    char path[1024];
    char* scclCudaPath = getenv("RCCL_ROCR_PATH");
    if(scclCudaPath == NULL)
        snprintf(path, 1024, "%s", "libhsa-runtime64.so");
    else
        snprintf(path, 1024, "%s%s", scclCudaPath, "libhsa-runtime64.so");

    hsaLib = dlopen(path, RTLD_LAZY);
    if(hsaLib == NULL) {
        WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", scclCudaPath, scclCudaPath);
        goto error;
    }

    /*
     * Load initial ROCr functions
     */

    pfn_hsa_init = (PFN_hsa_init)dlsym(hsaLib, "hsa_init");
    if(pfn_hsa_init == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_init");
        goto error;
    }
    pfn_hsa_init();

    pfn_hsa_system_get_info = (PFN_hsa_system_get_info)dlsym(hsaLib, "hsa_system_get_info");
    if(pfn_hsa_system_get_info == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_system_get_info");
        goto error;
    }

    pfn_hsa_status_string = (PFN_hsa_status_string)dlsym(hsaLib, "hsa_status_string");
    if(pfn_hsa_status_string == NULL) {
        WARN("Failed to load ROCr missing symbol hsa_status_string");
        goto error;
    }

    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
    if(res != 0) {
        WARN("pfn_hsa_system_get_info failed with %d", res);
        goto error;
    }
    res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
    if(res != 0) {
        WARN("pfn_hsa_system_get_info failed with %d", res);
        goto error;
    }

    INFO(SCCL_LOG_NET, "ROCr version %d.%d", version_major, version_minor);

    // if (hsaDriverVersion < ROCR_DRIVER_MIN_VERSION) {
    //  WARN("ROCr Driver version found is %d. Minimum requirement is %d", hsaDriverVersion, ROCR_DRIVER_MIN_VERSION);
    //  Silently ignore version check mismatch for backwards compatibility
    // goto error;
    //}

    /* DMA-BUF support */
    // ROCm support
    if(scclParamDmaBufEnable() == 0) {
        INFO(SCCL_LOG_NET, "Dmabuf feature disabled without SCCL_ENABLE_DMABUF_SUPPORT=1");
        goto error;
    }
    res = pfn_hsa_system_get_info((hsa_system_info_t)0x204, &dmaBufSupport);
    if(res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
        INFO(SCCL_LOG_NET, "Current version of ROCm does not support dmabuf feature.");
        goto error;
    } else {
        pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf)dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
        if(pfn_hsa_amd_portable_export_dmabuf == NULL) {
            WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
            goto error;
        } else {
            // check OS kernel support
            struct utsname utsname;
            FILE* fp             = NULL;
            char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
            char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
            char kernel_conf_file[128];
            char buf[256];
            int found_opt1 = 0;
            int found_opt2 = 0;

            // check for kernel name exists
            if(uname(&utsname) == -1)
                INFO(SCCL_LOG_NET, "Could not get kernel name");
            // format and store the kernel conf file location
            snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
            fp = fopen(kernel_conf_file, "r");
            if(fp == NULL)
                INFO(SCCL_LOG_NET, "Could not open kernel conf file");
            // look for kernel_opt1 and kernel_opt2 in the conf file and check
            while(fgets(buf, sizeof(buf), fp) != NULL) {
                if(strstr(buf, kernel_opt1) != NULL) {
                    found_opt1 = 1;
                    INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
                }
                if(strstr(buf, kernel_opt2) != NULL) {
                    found_opt2 = 1;
                    INFO(SCCL_LOG_NET, "CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
                }
            }
            if(!found_opt1 || !found_opt2) {
                dmaBufSupport = 0;
                INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
                INFO(SCCL_LOG_NET, "DMA_BUF_SUPPORT Failed due to OS kernel support");
            }

            if(dmaBufSupport)
                INFO(SCCL_LOG_NET, "DMA_BUF Support Enabled");
            else
                goto error;
        }
    }

    /*
     * Required to initialize the ROCr Driver.
     * Multiple calls of hsa_init() will return immediately
     * without making any relevant change
     */

    initResult = scclSuccess;

error:
    initResult = scclSystemError;
    return;
}

} // namespace rocm_wrap

/**
 * Public entry point for loading the ROCr runtime bindings.
 *
 * Hands rocm_wrap::initOnceFunc to pthread_once together with
 * rocm_wrap::initOnceControl, then returns whatever value is stored in
 * rocm_wrap::initResult.
 *
 * @return the scclResult_t held in rocm_wrap::initResult after the
 *         pthread_once call returns.
 */
scclResult_t rocmLibraryInit() {
    namespace wrap = rocm_wrap;

    pthread_once(&wrap::initOnceControl, wrap::initOnceFunc);
    return wrap::initResult;
}

} // namespace net
} // namespace hardware
} // namespace sccl