#include #include #include #include #include #include #include "base.h" #include "rocm_wrap.h" namespace sccl { namespace hardware { namespace net { namespace rocm_wrap { #define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support /* ROCr Driver functions loaded with dlsym() */ DECLARE_ROCM_PFN(hsa_init); DECLARE_ROCM_PFN(hsa_system_get_info); DECLARE_ROCM_PFN(hsa_status_string); SCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0); static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; static scclResult_t initResult; static void* hsaLib; static uint16_t version_major, version_minor; bool scclCudaLaunchBlocking = false; ////////////////////////////////////////////////////////////////////////////// static void initOnceFunc() { do { char* val = getenv("CUDA_LAUNCH_BLOCKING"); scclCudaLaunchBlocking = val != nullptr && val[0] != 0 && !(val[0] == '0' && val[1] == 0); } while(0); bool dmaBufSupport = false; hsa_status_t res; /* * Load ROCr driver library */ char path[1024]; char* scclCudaPath = getenv("RCCL_ROCR_PATH"); if(scclCudaPath == NULL) snprintf(path, 1024, "%s", "libhsa-runtime64.so"); else snprintf(path, 1024, "%s%s", scclCudaPath, "libhsa-runtime64.so"); hsaLib = dlopen(path, RTLD_LAZY); if(hsaLib == NULL) { WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", scclCudaPath, scclCudaPath); goto error; } /* * Load initial ROCr functions */ pfn_hsa_init = (PFN_hsa_init)dlsym(hsaLib, "hsa_init"); if(pfn_hsa_init == NULL) { WARN("Failed to load ROCr missing symbol hsa_init"); goto error; } pfn_hsa_init(); pfn_hsa_system_get_info = (PFN_hsa_system_get_info)dlsym(hsaLib, "hsa_system_get_info"); if(pfn_hsa_system_get_info == NULL) { WARN("Failed to load ROCr missing symbol hsa_system_get_info"); goto error; } pfn_hsa_status_string = (PFN_hsa_status_string)dlsym(hsaLib, "hsa_status_string"); if(pfn_hsa_status_string == NULL) { WARN("Failed to load ROCr missing symbol hsa_status_string"); goto error; } res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major); if(res != 0) { WARN("pfn_hsa_system_get_info failed with %d", res); goto error; } res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor); if(res != 0) { WARN("pfn_hsa_system_get_info failed with %d", res); goto error; } INFO(SCCL_LOG_NET, "ROCr version %d.%d", version_major, version_minor); // if (hsaDriverVersion < ROCR_DRIVER_MIN_VERSION) { // WARN("ROCr Driver version found is %d. Minimum requirement is %d", hsaDriverVersion, ROCR_DRIVER_MIN_VERSION); // Silently ignore version check mismatch for backwards compatibility // goto error; //} /* DMA-BUF support */ // ROCm support if(scclParamDmaBufEnable() == 0) { INFO(SCCL_LOG_NET, "Dmabuf feature disabled without SCCL_ENABLE_DMABUF_SUPPORT=1"); goto error; } res = pfn_hsa_system_get_info((hsa_system_info_t)0x204, &dmaBufSupport); if(res != HSA_STATUS_SUCCESS || !dmaBufSupport) { INFO(SCCL_LOG_NET, "Current version of ROCm does not support dmabuf feature."); goto error; } else { pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf)dlsym(hsaLib, "hsa_amd_portable_export_dmabuf"); if(pfn_hsa_amd_portable_export_dmabuf == NULL) { WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf"); goto error; } else { // check OS kernel support struct utsname utsname; FILE* fp = NULL; char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y"; char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y"; char kernel_conf_file[128]; char buf[256]; int found_opt1 = 0; int found_opt2 = 0; // check for kernel name exists if(uname(&utsname) == -1) INFO(SCCL_LOG_NET, "Could not get kernel name"); // format and store the kernel conf file location snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release); fp = fopen(kernel_conf_file, "r"); if(fp == NULL) INFO(SCCL_LOG_NET, "Could not open kernel conf file"); // look for kernel_opt1 and kernel_opt2 in the conf file and check while(fgets(buf, sizeof(buf), fp) != NULL) { if(strstr(buf, kernel_opt1) != NULL) { found_opt1 = 1; INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release); } if(strstr(buf, kernel_opt2) != NULL) { found_opt2 = 1; INFO(SCCL_LOG_NET, "CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release); } } if(!found_opt1 || !found_opt2) { dmaBufSupport = 0; INFO(SCCL_LOG_NET, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release); INFO(SCCL_LOG_NET, "DMA_BUF_SUPPORT Failed due to OS kernel support"); } if(dmaBufSupport) INFO(SCCL_LOG_NET, "DMA_BUF Support Enabled"); else goto error; } } /* * Required to initialize the ROCr Driver. * Multiple calls of hsa_init() will return immediately * without making any relevant change */ initResult = scclSuccess; error: initResult = scclSystemError; return; } } // namespace rocm_wrap scclResult_t rocmLibraryInit() { pthread_once(&rocm_wrap::initOnceControl, rocm_wrap::initOnceFunc); return rocm_wrap::initResult; } } // namespace net } // namespace hardware } // namespace sccl