rocmwrap.cc 5.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/*************************************************************************
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "nccl.h"
#include "debug.h"
#include "rocmwrap.h"
#include "hsa/hsa.h"
#include "param.h"

#include <dlfcn.h>
#include <sys/utsname.h>
#include <fstream>

/*
 * Defines a global function pointer named pfn_<symbol>, of type PFN_<symbol>,
 * initialised to nullptr. The PFN_<symbol> typedefs are not declared in this
 * file (presumably they come from rocmwrap.h -- confirm). The pointers stay
 * null until initOnceFunc() resolves them with dlsym().
 */
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr

DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/*
 * Run-time switch read through ncclParamDmaBufEnable() in initOnceFunc().
 * NOTE(review): NCCL_PARAM is defined elsewhere (param.h); the last argument
 * looks like the default value (0) -- confirm against the macro definition.
 */
NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
DECLARE_ROCM_PFN(hsa_status_string);


/* Handle returned by dlopen() for the ROCr runtime library. */
static void *hsaLib;
/* Runtime version numbers filled in by hsa_system_get_info() during init. */
static uint16_t version_major, version_minor;
/* Set by initOnceFunc() from the CUDA_LAUNCH_BLOCKING environment variable. */
bool ncclCudaLaunchBlocking = false;

/* pthread_once() control that makes initOnceFunc() run at most once. */
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
/* Outcome recorded by initOnceFunc() and handed back by rocmLibraryInit(). */
static ncclResult_t initResult;

static void initOnceFunc() {
  do {
    char* val = getenv("CUDA_LAUNCH_BLOCKING");
    ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
  } while (0);

  bool dmaBufSupport = false;
  hsa_status_t res;

  /*
   * Load ROCr driver library
   */
  char path[1024];
  char *ncclCudaPath = getenv("RCCL_ROCR_PATH");
  if (ncclCudaPath == NULL)
    snprintf(path, 1024, "%s", "libhsa-runtime64.so");
  else
    snprintf(path, 1024, "%s%s", ncclCudaPath, "libhsa-runtime64.so");

  hsaLib = dlopen(path, RTLD_LAZY);
  if (hsaLib == NULL) {
    WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", ncclCudaPath, ncclCudaPath);
    goto error;
  }

  /*
   * Load initial ROCr functions
   */

  pfn_hsa_init = (PFN_hsa_init) dlsym(hsaLib, "hsa_init");
  if (pfn_hsa_init == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_init");
    goto error;
  }
  pfn_hsa_init();

  pfn_hsa_system_get_info = (PFN_hsa_system_get_info) dlsym(hsaLib, "hsa_system_get_info");
  if (pfn_hsa_system_get_info == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_system_get_info");
    goto error;
  }

  pfn_hsa_status_string = (PFN_hsa_status_string) dlsym(hsaLib, "hsa_status_string");
  if (pfn_hsa_status_string == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_status_string");
    goto error;
  }

  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }
  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }

  INFO(NCCL_INIT, "ROCr version %d.%d", version_major, version_minor);

  //if (hsaDriverVersion < ROCR_DRIVER_MIN_VERSION) {
    // WARN("ROCr Driver version found is %d. Minimum requirement is %d", hsaDriverVersion, ROCR_DRIVER_MIN_VERSION);
    // Silently ignore version check mismatch for backwards compatibility
    //goto error;
  //}

  /* DMA-BUF support */
  //ROCm support
  if (ncclParamDmaBufEnable() == 0 ) {
    INFO(NCCL_INIT, "Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1");
    goto error;
  }
  res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport);
  if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
    INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
    goto error;
  }
  else {
    pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
    if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
      WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
      goto error;
    }
    else {
      //check OS kernel support
      struct utsname utsname;
      FILE *fp = NULL;
      char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
      char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
      char kernel_conf_file[128];
      char buf[256];
      int found_opt1 = 0;
      int found_opt2 = 0;

      //check for kernel name exists
      if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
      //format and store the kernel conf file location
      snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
      fp = fopen(kernel_conf_file, "r");
      if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
      //look for kernel_opt1 and kernel_opt2 in the conf file and check
      while (fgets(buf, sizeof(buf), fp) != NULL) {
        if (strstr(buf, kernel_opt1) != NULL) {
          found_opt1 = 1;
          INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
        }
        if (strstr(buf, kernel_opt2) != NULL) {
          found_opt2 = 1;
          INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
        }
      }
      if (!found_opt1 || !found_opt2) {
        dmaBufSupport = 0;
        INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
        INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
      }

      if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
      else goto error;
    }
  }

  /*
   * Required to initialize the ROCr Driver.
   * Multiple calls of hsa_init() will return immediately
   * without making any relevant change
   */

  initResult = ncclSuccess;

error:
  initResult = ncclSystemError;
}

/*
 * Reports whether the cuMem code path is enabled.
 * This implementation unconditionally answers 0 (disabled).
 * NOTE(review): presumably because that path has no ROCm backend here --
 * confirm against the callers.
 */
int ncclCuMemEnable() {
  enum { kCuMemDisabled = 0 };
  return kCuMemDisabled;
}

/*
 * Public entry point for the ROCr wrapper initialisation.
 *
 * Runs initOnceFunc() through pthread_once() on initOnceControl, so the
 * initialisation body executes at most once however often this is called,
 * then returns whatever that run stored in initResult.
 */
ncclResult_t rocmLibraryInit() {
  // The return value of pthread_once() is deliberately not inspected.
  (void) pthread_once(&initOnceControl, initOnceFunc);
  const ncclResult_t outcome = initResult;
  return outcome;
}