Unverified Commit 259ad441 authored by Aarni Koskela, committed by GitHub

CUDA setup cleanup (#996)

* Diagnostics: streamline debug printing code

* CUDA setup: Remove unused `backup_paths`

* CUDA setup: DRY OS detection

* CUDA setup: Streamline `manual_override()`

* CUDA setup: Use comment instead of string literal, simplify

* CUDA setup: remove duplicate sort

The "sort compute capabilities" fix from #703 (#527) would actually do nothing due to this.

* CUDA setup: make version number replacement logic more obvious
parent acc7fb37
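
To illustrate the duplicate-sort bullet, a minimal sketch; the capability values are made up, and the stub stands in for the real torch-based implementation:

def get_compute_capabilities():
    # stand-in for the real implementation, which collects one
    # "major.minor" string per visible GPU and returns them sorted
    return sorted(["8.6", "7.5", "8.0"])

# Before this commit, the call site sorted the result a second time:
ccs = get_compute_capabilities()
ccs.sort()    # duplicate sort: the list is already sorted, so this changes nothing
cc = ccs[-1]  # "8.6" -- the highest capability

# After this commit, the call site takes the highest capability directly:
cc = get_compute_capabilities()[-1]
print(cc)  # 8.6

With both sorts in place, one of the two is necessarily redundant; the commit keeps the sort inside get_compute_capabilities() and drops the one at the call site.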
+import glob
 import os
-from os.path import isdir
 import sys
 from warnings import warn
@@ -8,17 +8,9 @@ import torch
 
 HEADER_WIDTH = 60
 
-def find_file_recursive(folder, filename):
-    import glob
-    outs = []
-    try:
-        for ext in ["so", "dll", "dylib"]:
-            out = glob.glob(os.path.join(folder, "**", filename + ext))
-            outs.extend(out)
-    except Exception as e:
-        raise RuntimeError('Error: Something when wrong when trying to find file.') from e
-    return outs
+def find_dynamic_library(folder, filename):
+    for ext in ("so", "dll", "dylib"):
+        yield from glob.glob(os.path.join(folder, "**", filename + ext))
 
 
 def generate_bug_report_information():
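
A quick usage sketch of the new helper: it is a generator, so callers that need a list must materialize it, as generate_bug_report_information() below does. The helper is copied from the diff above; the search path is illustrative:

import glob
import os

def find_dynamic_library(folder, filename):
    for ext in ("so", "dll", "dylib"):
        # filename is a glob pattern; '*cuda*' + 'so' yields '*cuda*so',
        # which matches e.g. 'libcudart.so'
        # (without recursive=True, "**" matches a single path segment, like "*")
        yield from glob.glob(os.path.join(folder, "**", filename + ext))

# Lazy generator: materialize it when a list is needed.
print(list(find_dynamic_library("/usr/local", "*cuda*")))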
@@ -27,40 +19,25 @@ def generate_bug_report_information():
     print_header("")
     print('')
 
-    if 'CONDA_PREFIX' in os.environ:
-        paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*')
-        print_header("ANACONDA CUDA PATHS")
-        print(paths)
-        print('')
-    if isdir('/usr/local/'):
-        paths = find_file_recursive('/usr/local', '*cuda*')
-        print_header("/usr/local CUDA PATHS")
-        print(paths)
-        print('')
-    if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']):
-        paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*')
-        print_header("CUDA PATHS")
-        print(paths)
-        print('')
-
-    if isdir(os.getcwd()):
-        paths = find_file_recursive(os.getcwd(), '*cuda*')
-        print_header("WORKING DIRECTORY CUDA PATHS")
-        print(paths)
-        print('')
-
-    print_header("LD_LIBRARY CUDA PATHS")
-    if 'LD_LIBRARY_PATH' in os.environ:
-        lib_path = os.environ['LD_LIBRARY_PATH'].strip()
-        for path in set(lib_path.split(os.pathsep)):
-            try:
-                if isdir(path):
-                    print_header(f"{path} CUDA PATHS")
-                    paths = find_file_recursive(path, '*cuda*')
-                    print(paths)
-            except Exception as e:
-                print(f'Could not read LD_LIBRARY_PATH: {path} ({e})')
-    print('')
+    path_sources = [
+        ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")),
+        ("/usr/local CUDA PATHS", "/usr/local"),
+        ("CUDA PATHS", os.environ.get("CUDA_PATH")),
+        ("WORKING DIRECTORY CUDA PATHS", os.getcwd()),
+    ]
+    try:
+        ld_library_path = os.environ.get("LD_LIBRARY_PATH")
+        if ld_library_path:
+            for path in set(ld_library_path.strip().split(os.pathsep)):
+                path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path))
+    except Exception as e:
+        print(f"Could not parse LD_LIBRARY_PATH: {e}")
+
+    for name, path in path_sources:
+        if path and os.path.isdir(path):
+            print_header(name)
+            print(list(find_dynamic_library(path, '*cuda*')))
+            print("")
 
 
 def print_header(
...
@@ -28,19 +28,17 @@ import torch
 
 from .env_vars import get_potentially_lib_path_containing_env_vars
 
-# these are the most common libs names
-# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
-# we have libcudart.so.11.0 which causes a lot of errors before
-# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
-system = platform.system()
-if system == 'Windows':
+if platform.system() == 'Windows':  # Windows
     CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
-else: # Linux or other
-    CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2']
-
-# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths
-backup_paths = []
-backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0')
+    DYNAMIC_LIBRARY_SUFFIX = ".dll"
+else:  # Linux or other
+    # these are the most common libs names
+    # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
+    # we have libcudart.so.11.0 which causes a lot of errors before
+    # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt
+    CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"]
+    DYNAMIC_LIBRARY_SUFFIX = ".so"
 
 
 class CUDASetup:
     _instance = None
@@ -108,22 +106,30 @@ class CUDASetup:
         self.error = False
 
     def manual_override(self):
-        if torch.cuda.is_available():
-            if 'BNB_CUDA_VERSION' in os.environ:
-                if len(os.environ['BNB_CUDA_VERSION']) > 0:
-                    warn(
-                        f'\n\n{"=" * 80}\n'
-                        'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
-                        'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
-                        'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
-                        'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
-                        'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
-                        f'Loading CUDA version: BNB_CUDA_VERSION={os.environ["BNB_CUDA_VERSION"]}'
-                        f'\n{"=" * 80}\n\n'
-                    )
-                    binary_name = self.binary_name.rsplit(".", 1)[0]
-                    suffix = ".so" if os.name != "nt" else ".dll"
-                    self.binary_name = binary_name[:-3] + f'{os.environ["BNB_CUDA_VERSION"]}.{suffix}'
+        if not torch.cuda.is_available():
+            return
+        override_value = os.environ.get('BNB_CUDA_VERSION')
+        if not override_value:
+            return
+
+        binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".")
+        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`;
+        # let's remove any trailing numbers:
+        binary_name_stem = binary_name_stem.rstrip("0123456789")
+        # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`;
+        # let's tack the new version number and the original extension back on.
+        self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}"
+
+        warn(
+            f'\n\n{"=" * 80}\n'
+            'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n'
+            'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n'
+            'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n'
+            'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n'
+            'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n'
+            f'Loading: {self.binary_name}'
+            f'\n{"=" * 80}\n\n'
+        )
 
     def run_cuda_setup(self):
         self.initialized = True
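
A worked example of the new stem/extension handling in manual_override(); the file name and override value are illustrative:

binary_name = "libbitsandbytes_cuda118.so"
override_value = "122"  # as if BNB_CUDA_VERSION=122

stem, _, ext = binary_name.rpartition(".")  # ("libbitsandbytes_cuda118", ".", "so")
stem = stem.rstrip("0123456789")            # "libbitsandbytes_cuda"
print(f"{stem}{override_value}.{ext}")      # libbitsandbytes_cuda122.so

Unlike the old fixed-width binary_name[:-3] slice, rstrip("0123456789") copes with version suffixes of any length.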
@@ -140,11 +146,10 @@ class CUDASetup:
 
         package_dir = Path(__file__).parent.parent
         binary_path = package_dir / self.binary_name
-        suffix = ".so" if os.name != "nt" else ".dll"
 
         try:
             if not binary_path.exists():
                 self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
-                legacy_binary_name = f"libbitsandbytes_cpu{suffix}"
+                legacy_binary_name = f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}"
                 self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
                 binary_path = package_dir / legacy_binary_name
                 if not binary_path.exists() or torch.cuda.is_available():
@@ -348,19 +353,18 @@ def get_compute_capabilities():
 
 def evaluate_cuda_setup():
     cuda_setup = CUDASetup.get_instance()
-    suffix = ".so" if os.name != "nt" else ".dll"
     if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
         cuda_setup.add_log_entry('')
         cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35)
         cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
                                  ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
         cuda_setup.add_log_entry('='*80)
-    if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None
+    if not torch.cuda.is_available():
+        return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None
 
     cudart_path = determine_cuda_runtime_lib_path()
-    ccs = get_compute_capabilities()
-    ccs.sort()
-    cc = ccs[-1]  # we take the highest capability
+    cc = get_compute_capabilities()[-1]  # we take the highest capability
     cuda_version_string = get_cuda_version()
 
     cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.")
@@ -380,12 +384,11 @@ def evaluate_cuda_setup():
 
     # we use ls -l instead of nvcc to determine the cuda version
     # since most installations will have the libcudart.so installed, but not the compiler
-    if has_cublaslt:
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
-    else:
-        "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt"
-        binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt"
-    binary_name = f"{binary_name}{suffix}"
+    binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
+    if not has_cublaslt:
+        # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
+        binary_name += "_nocublaslt"
+    binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}"
 
     return binary_name, cudart_path, cc, cuda_version_string
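
For illustration, the names this logic produces; the CUDA version string is an example value:

DYNAMIC_LIBRARY_SUFFIX = ".so"  # ".dll" on Windows
cuda_version_string = "118"

for has_cublaslt in (True, False):
    binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
    if not has_cublaslt:  # compute capability < 7.5
        binary_name += "_nocublaslt"
    print(f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}")
# libbitsandbytes_cuda118.so
# libbitsandbytes_cuda118_nocublaslt.so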