main.py 17.2 KB
Newer Older
1
"""
2
extract factors the build is dependent on:
3
[X] compute capability
4
    [ ] TODO: Q - What if we have multiple GPUs of different makes?
5
6
7
8
9
10
11
12
13
14
15
16
17
18
- CUDA version
- Software:
    - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
    - CuBLAS-LT: full-build 8-bit optimizer
    - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)

evaluation:
    - if paths faulty, return meaningful error
    - else:
        - determine CUDA version
        - determine capabilities
        - based on that set the default path
"""

19
import ctypes as ct
20
import os
21
import errno
22
import torch
23

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from pathlib import Path
from typing import Set, Union
from .env_vars import get_potentially_lib_path_containing_env_vars

CUDA_RUNTIME_LIB: str = "libcudart.so"

class CUDASetup:
    _instance = None

    def __init__(self):
        raise RuntimeError("Call get_instance() instead")

    def generate_instructions(self):
        if self.cuda is None:
            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected.')
            self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.')
            self.add_log_entry('CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:')
            self.add_log_entry('CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null')
            self.add_log_entry('CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a')
            self.add_log_entry('CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc')
            return

        if self.cudart_path is None:
            self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.')
            self.add_log_entry('CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable')
            self.add_log_entry('CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null')
            self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
            self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
            self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
            self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://github.com/TimDettmers/bitsandbytes/blob/main/cuda_install.sh')
            self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
            self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
            return

        make_cmd = f'CUDA_VERSION={self.cuda_version_string}'
        if len(self.cuda_version_string) < 3:
            make_cmd += ' make cuda92'
        elif self.cuda_version_string == '110':
            make_cmd += ' make cuda110'
        elif self.cuda_version_string[:2] == '11' and int(self.cuda_version_string[2]) > 0:
            make_cmd += ' make cuda11x'
65
66
67
68
69
        elif self.cuda_version_string == '100':
            self.add_log_entry('CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.')
            self.add_log_entry('CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables.')
            return

70

71
        has_cublaslt = is_cublasLt_compatible(self.cc)
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
        if not has_cublaslt:
            make_cmd += '_nomatmul'

        self.add_log_entry('CUDA SETUP: Something unexpected happened. Please compile from source:')
        self.add_log_entry('git clone git@github.com:TimDettmers/bitsandbytes.git')
        self.add_log_entry('cd bitsandbytes')
        self.add_log_entry(make_cmd)
        self.add_log_entry('python setup.py install')

    def initialize(self):
        self.has_printed = False
        self.lib = None
        self.initialized = False

    def run_cuda_setup(self):
        self.initialized = True
        self.cuda_setup_log = []

        binary_name, cudart_path, cuda, cc, cuda_version_string = evaluate_cuda_setup()
        self.cudart_path = cudart_path
        self.cuda = cuda
        self.cc = cc
        self.cuda_version_string = cuda_version_string

        package_dir = Path(__file__).parent.parent
        binary_path = package_dir / binary_name

        try:
            if not binary_path.exists():
                self.add_log_entry(f"CUDA SETUP: Required library version not found: {binary_name}. Maybe you need to compile it from source?")
102
                legacy_binary_name = "libbitsandbytes_cpu.so"
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
                self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
                binary_path = package_dir / legacy_binary_name
                if not binary_path.exists():
                    self.add_log_entry('')
                    self.add_log_entry('='*48 + 'ERROR' + '='*37)
                    self.add_log_entry('CUDA SETUP: CUDA detection failed! Possible reasons:')
                    self.add_log_entry('1. CUDA driver not installed')
                    self.add_log_entry('2. CUDA not installed')
                    self.add_log_entry('3. You have multiple conflicting CUDA libraries')
                    self.add_log_entry('4. Required library not pre-compiled for this bitsandbytes release!')
                    self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
                    self.add_log_entry('='*80)
                    self.add_log_entry('')
                    self.generate_instructions()
                    self.print_log_stack()
                    raise Exception('CUDA SETUP: Setup Failed!')
                self.lib = ct.cdll.LoadLibrary(binary_path)
            else:
                self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
                self.lib = ct.cdll.LoadLibrary(binary_path)
        except Exception as ex:
            self.add_log_entry(str(ex))
            self.print_log_stack()

    def add_log_entry(self, msg, is_warning=False):
        self.cuda_setup_log.append((msg, is_warning))

    def print_log_stack(self):
        for msg, is_warning in self.cuda_setup_log:
            if is_warning:
                warn(msg)
            else:
                print(msg)

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls.__new__(cls)
            cls._instance.initialize()
        return cls._instance


145
146
147
148
149
150
151
152
153
def is_cublasLt_compatible(cc):
    has_cublaslt = False
    if cc is not None:
        cc_major, cc_minor = cc.split('.')
        if int(cc_major) < 7 or (int(cc_major) == 7 and int(cc_minor) < 5):
            cuda_setup.add_log_entry("WARNING: Compute capability < 7.5 detected! Proceeding to load CPU-only library...", is_warning=True)
        else:
            has_cublaslt = True
    return has_cublaslt
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260

def extract_candidate_paths(paths_list_candidate: str) -> Set[Path]:
    return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path}


def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
    existent_directories: Set[Path] = set()
    for path in candidate_paths:
        try:
            if path.exists():
                existent_directories.add(path)
        except OSError as exc:
            if exc.errno != errno.ENAMETOOLONG:
                raise exc

    non_existent_directories: Set[Path] = candidate_paths - existent_directories
    if non_existent_directories:
        CUDASetup.get_instance().add_log_entry("WARNING: The following directories listed in your path were found to "
            f"be non-existent: {non_existent_directories}", is_warning=True)

    return existent_directories


def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]:
    return {
        path / CUDA_RUNTIME_LIB
        for path in candidate_paths
        if (path / CUDA_RUNTIME_LIB).is_file()
    }


def resolve_paths_list(paths_list_candidate: str) -> Set[Path]:
    """
    Searches a given environmental var for the CUDA runtime library,
    i.e. `libcudart.so`.
    """
    return remove_non_existent_dirs(extract_candidate_paths(paths_list_candidate))


def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]:
    return get_cuda_runtime_lib_paths(
        resolve_paths_list(paths_list_candidate)
    )


def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
    if len(results_paths) > 1:
        warning_msg = (
            f"Found duplicate {CUDA_RUNTIME_LIB} files: {results_paths}.. "
            "We'll flip a coin and try one of these, in order to fail forward.\n"
            "Either way, this might cause trouble in the future:\n"
            "If you get `CUDA error: invalid device function` errors, the above "
            "might be the cause and the solution is to make sure only one "
            f"{CUDA_RUNTIME_LIB} in the paths that we search based on your env.")
        CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)


def determine_cuda_runtime_lib_path() -> Union[Path, None]:
    """
        Searches for a cuda installations, in the following order of priority:
            1. active conda env
            2. LD_LIBRARY_PATH
            3. any other env vars, while ignoring those that
                - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`)
                - don't contain the path separator `/`

        If multiple libraries are found in part 3, we optimistically try one,
        while giving a warning message.
    """
    candidate_env_vars = get_potentially_lib_path_containing_env_vars()

    if "CONDA_PREFIX" in candidate_env_vars:
        conda_libs_path = Path(candidate_env_vars["CONDA_PREFIX"]) / "lib"

        conda_cuda_libs = find_cuda_lib_in(str(conda_libs_path))
        warn_in_case_of_duplicates(conda_cuda_libs)

        if conda_cuda_libs:
            return next(iter(conda_cuda_libs))

        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["CONDA_PREFIX"]} did not contain '
            f'{CUDA_RUNTIME_LIB} as expected! Searching further paths...', is_warning=True)

    if "LD_LIBRARY_PATH" in candidate_env_vars:
        lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"])

        if lib_ld_cuda_libs:
            return next(iter(lib_ld_cuda_libs))
        warn_in_case_of_duplicates(lib_ld_cuda_libs)

        CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain '
            f'{CUDA_RUNTIME_LIB} as expected! Searching further paths...', is_warning=True)

    remaining_candidate_env_vars = {
        env_var: value for env_var, value in candidate_env_vars.items()
        if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"}
    }

    cuda_runtime_libs = set()
    for env_var, value in remaining_candidate_env_vars.items():
        cuda_runtime_libs.update(find_cuda_lib_in(value))

    if len(cuda_runtime_libs) == 0:
        CUDASetup.get_instance().add_log_entry('CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...')
        cuda_runtime_libs.update(find_cuda_lib_in('/usr/local/cuda/lib64'))

    warn_in_case_of_duplicates(cuda_runtime_libs)
261

262
    return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None
Tom Aarsen's avatar
Tom Aarsen committed
263

264
265

def check_cuda_result(cuda, result_val):
266
    # 3. Check for CUDA errors
267
    if result_val != 0:
268
269
        error_str = ct.c_char_p()
        cuda.cuGetErrorString(result_val, ct.byref(error_str))
270
        CUDASetup.get_instance().add_log_entry(f"CUDA exception! Error code: {error_str.value.decode()}")
271

272
273

# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
274
def get_cuda_version(cuda, cudart_path):
275
276
    if cuda is None: return None

277
    try:
278
        cudart = ct.CDLL(cudart_path)
279
    except OSError:
280
        CUDASetup.get_instance().add_log_entry(f'ERROR: libcudart.so could not be read from path: {cudart_path}!')
281
282
        return None

283
284
    version = ct.c_int()
    check_cuda_result(cuda, cudart.cudaRuntimeGetVersion(ct.byref(version)))
285
286
287
288
    version = int(version.value)
    major = version//1000
    minor = (version-(major*1000))//10

289
    if major < 11:
290
        CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
291

292
293
294
295
296
297
    return f'{major}{minor}'


def get_cuda_lib_handle():
    # 1. find libcuda.so library (GPU driver) (/usr/lib)
    try:
298
        cuda = ct.CDLL("libcuda.so")
299
    except OSError:
300
        CUDASetup.get_instance().add_log_entry('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
301
302
        return None
    check_cuda_result(cuda, cuda.cuInit(0))
303

304
305
306
307
    return cuda


def get_compute_capabilities(cuda):
308
309
310
311
312
313
314
315
316
    """
    1. find libcuda.so library (GPU driver) (/usr/lib)
       init_device -> init variables -> call function by reference
    2. call extern C function to determine CC
       (https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html)
    3. Check for CUDA errors
       https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
    # bits taken from https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549
    """
317

318
319
320
    nGpus = ct.c_int()
    cc_major = ct.c_int()
    cc_minor = ct.c_int()
321

322
    device = ct.c_int()
323

324
    check_cuda_result(cuda, cuda.cuDeviceGetCount(ct.byref(nGpus)))
325
326
    ccs = []
    for i in range(nGpus.value):
327
328
329
        check_cuda_result(cuda, cuda.cuDeviceGet(ct.byref(device), i))
        ref_major = ct.byref(cc_major)
        ref_minor = ct.byref(cc_minor)
330
        # 2. call extern C function to determine CC
331
        check_cuda_result(cuda, cuda.cuDeviceComputeCapability(ref_major, ref_minor, device))
332
        ccs.append(f"{cc_major.value}.{cc_minor.value}")
333

Tim Dettmers's avatar
Tim Dettmers committed
334
    return ccs
335
336


337
# def get_compute_capability()-> Union[List[str, ...], None]: # FIXME: error
338
def get_compute_capability(cuda):
339
340
341
342
343
    """
    Extracts the highest compute capbility from all available GPUs, as compute
    capabilities are downwards compatible. If no GPUs are detected, it returns
    None.
    """
344
345
346
    if cuda is None: return None

    # TODO: handle different compute capabilities; for now, take the max
347
    ccs = get_compute_capabilities(cuda)
348
    if ccs: return ccs[-1]
349

350

351
def evaluate_cuda_setup():
352
353
    if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0':
        print('')
Tim Dettmers's avatar
Tim Dettmers committed
354
        print('='*35 + 'BUG REPORT' + '='*35)
355
356
        print('Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')
        print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
Tim Dettmers's avatar
Tim Dettmers committed
357
        print('='*80)
358
    if not torch.cuda.is_available(): return 'libsbitsandbytes_cpu.so', None, None, None, None
359
360

    cuda_setup = CUDASetup.get_instance()
361
362
363
364
365
    cudart_path = determine_cuda_runtime_lib_path()
    cuda = get_cuda_lib_handle()
    cc = get_compute_capability(cuda)
    cuda_version_string = get_cuda_version(cuda, cudart_path)

366
367
368
369
370
    failure = False
    if cudart_path is None:
        failure = True
        cuda_setup.add_log_entry("WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!", is_warning=True)
    else:
Tom Aarsen's avatar
Tom Aarsen committed
371
        cuda_setup.add_log_entry(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
372

373
    if cc == '' or cc is None:
374
375
376
377
        failure = True
        cuda_setup.add_log_entry("WARNING: No GPU detected! Check your CUDA paths. Proceeding to load CPU-only library...", is_warning=True)
    else:
        cuda_setup.add_log_entry(f"CUDA SETUP: Highest compute capability among GPUs detected: {cc}")
378

379
380
381
382
    if cuda is None:
        failure = True
    else:
        cuda_setup.add_log_entry(f'CUDA SETUP: Detected CUDA version {cuda_version_string}')
383

384
    # 7.5 is the minimum CC vor cublaslt
385
    has_cublaslt = is_cublasLt_compatible(cc)
386

387
    # TODO:
388
    # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
389
390
    # (2) Multiple CUDA versions installed

391
392
    # we use ls -l instead of nvcc to determine the cuda version
    # since most installations will have the libcudart.so installed, but not the compiler
393

394
395
396
397
398
    if failure:
        binary_name = "libbitsandbytes_cpu.so"
    elif has_cublaslt:
        binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so"
    else:
399
        "if not has_cublaslt (CC < 7.5), then we have to choose  _nocublaslt.so"
400
        binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
401

402
    return binary_name, cudart_path, cuda, cc, cuda_version_string