Unverified Commit f0ec93d0 authored by Tim Dettmers, committed by GitHub

Merge pull request #76 from tomaarsen/cleanup

Cleanup involving a handful of failures, some optimization and a lot of code quality improvements
parents c059bd28 c91f592a
......@@ -3,6 +3,7 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import cuda_setup, utils
from .autograd._functions import (
MatmulLtState,
bmm_cublas,
......@@ -12,7 +13,6 @@ from .autograd._functions import (
)
from .cextension import COMPILED_WITH_CUDA
from .nn import modules
from . import cuda_setup, utils
if COMPILED_WITH_CUDA:
from .optim import adam
......
# from bitsandbytes.debug_cli import cli
# cli()
import os
import sys
from warnings import warn
......@@ -31,8 +28,8 @@ print()
from . import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL
from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
from .cuda_setup.env_vars import to_be_ignored
from .cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
print_header("POTENTIALLY LIBRARY-PATH-LIKE ENV VARS")
for k, v in os.environ.items():
......
import operator
import warnings
from dataclasses import dataclass
from functools import reduce # Required in Python 3
import torch
import bitsandbytes.functional as F
from dataclasses import dataclass
from functools import reduce # Required in Python 3
# math.prod not compatible with python < 3.8
def prod(iterable):
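The prod helper exists because math.prod only arrived in Python 3.8; a minimal sketch of the fallback, assuming the same reduce-based definition used in this file:

    import operator
    from functools import reduce

    def prod(iterable):
        # Multiply all elements together; mirrors math.prod on Python >= 3.8.
        return reduce(operator.mul, iterable, 1)

    assert prod([2, 3, 4]) == 24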
......@@ -18,7 +19,7 @@ tensor = torch.Tensor
This is particularly important for small models where outlier features
are less systematic and occur with low frequency.
"""
class GlobalOutlierPooler(object):
class GlobalOutlierPooler:
_instance = None
def __init__(self):
......@@ -49,8 +50,9 @@ class GlobalOutlierPooler(object):
class MatMul8bit(torch.autograd.Function):
@staticmethod
def forward(ctx, A, B, out=None, quant_type="vector", precision=[8, 8, 8]):
def forward(ctx, A, B, out=None, quant_type="vector", precision=None):
if precision is None:
precision = [8, 8, 8]
if precision[0] != 8:
with torch.no_grad():
output = torch.matmul(A, B)
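This hunk replaces the mutable list default precision=[8, 8, 8] with a None sentinel. A minimal sketch of the pitfall being avoided, using a hypothetical function rather than anything from this diff:

    def bad(x, acc=[]):
        # The default list is created once at definition time and shared by every call.
        acc.append(x)
        return acc

    def good(x, acc=None):
        # A None sentinel gives each call its own fresh list.
        if acc is None:
            acc = []
        acc.append(x)
        return acc

    assert bad(1) == [1] and bad(2) == [1, 2]    # state leaks between calls
    assert good(1) == [1] and good(2) == [2]     # calls stay independent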
......
import ctypes as ct
import torch
from pathlib import Path
from warnings import warn
import torch
class CUDASetup(object):
class CUDASetup:
_instance = None
def __init__(self):
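Dropping the explicit (object) base is purely cosmetic: in Python 3 every class is already new-style. The _instance attribute these classes keep is the usual lazily created singleton; a minimal, simplified sketch of that pattern (hypothetical class, not the exact get_instance used in this package):

    class Singleton:                 # behaves the same as "class Singleton(object):" on Python 3
        _instance = None

        @classmethod
        def get_instance(cls):
            # Build the shared instance on first use, then hand back the same object.
            if cls._instance is None:
                cls._instance = cls()
            return cls._instance

    assert Singleton.get_instance() is Singleton.get_instance()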
......@@ -122,7 +122,7 @@ try:
CUDASetup.get_instance().generate_instructions()
CUDASetup.get_instance().print_log_stack()
raise RuntimeError('''
CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs aboveto fix your environment!
CUDA Setup failed despite GPU being available. Inspect the CUDA SETUP outputs above to fix your environment!
If you cannot find any issues and suspect a bug, please open an issue with details about your environment:
https://github.com/TimDettmers/bitsandbytes/issues''')
lib.cadam32bit_g32
......
from .paths import CUDA_RUNTIME_LIB, extract_candidate_paths, determine_cuda_runtime_lib_path
from .main import evaluate_cuda_setup
from .paths import (
CUDA_RUNTIME_LIB,
determine_cuda_runtime_lib_path,
extract_candidate_paths,
)
......@@ -17,11 +17,13 @@ evaluation:
"""
import ctypes
import torch
from .paths import determine_cuda_runtime_lib_path
from bitsandbytes.cextension import CUDASetup
from .paths import determine_cuda_runtime_lib_path
def check_cuda_result(cuda, result_val):
# 3. Check for CUDA errors
......@@ -48,7 +50,7 @@ def get_cuda_version(cuda, cudart_path):
minor = (version-(major*1000))//10
if major < 11:
CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currenlty not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
return f'{major}{minor}'
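cudaRuntimeGetVersion reports the version as a single integer (1000 * major + 10 * minor), which is what the arithmetic above unpacks; a short worked example with an illustrative value:

    version = 11070                            # e.g. CUDA 11.7
    major = version // 1000                    # 11
    minor = (version - (major * 1000)) // 10   # 70 // 10 == 7
    assert f"{major}{minor}" == "117"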
......@@ -129,7 +131,7 @@ def evaluate_cuda_setup():
failure = True
cuda_setup.add_log_entry("WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!", is_warning=True)
else:
cuda_setup.add_log_entry((f"CUDA SETUP: CUDA runtime path found: {cudart_path}"))
cuda_setup.add_log_entry(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
if cc == '' or cc is None:
failure = True
......
import errno
from pathlib import Path
from typing import Set, Union
from bitsandbytes.cextension import CUDASetup
from .env_vars import get_potentially_lib_path_containing_env_vars
......
import typer
cli = typer.Typer()
@cli.callback()
def callback():
"""
Awesome Portal Gun
"""
@cli.command()
def shoot():
"""
Shoot the portal gun
"""
typer.echo("Shooting portal gun")
@cli.command()
def load():
"""
Load the portal gun
"""
typer.echo("Loading portal gun")
......@@ -3,17 +3,19 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import ctypes as ct
import itertools
import operator
import random
import torch
import itertools
import math
from functools import reduce # Required in Python 3
from typing import Tuple
from torch import Tensor
from .cextension import COMPILED_WITH_CUDA, lib
from functools import reduce # Required in Python 3
# math.prod not compatible with python < 3.8
def prod(iterable):
......@@ -84,7 +86,7 @@ if COMPILED_WITH_CUDA:
)
class CUBLAS_Context(object):
class CUBLAS_Context:
_instance = None
def __init__(self):
......@@ -114,7 +116,7 @@ class CUBLAS_Context(object):
return self.context[device.index]
class Cusparse_Context(object):
class Cusparse_Context:
_instance = None
def __init__(self):
......@@ -264,12 +266,11 @@ def create_quantile_map(A, total_bits=8):
def get_special_format_str():
if not torch.cuda.is_available(): return 'col_turing'
major, minor = torch.cuda.get_device_capability()
major, _minor = torch.cuda.get_device_capability()
if major <= 7:
return "col_turing"
elif major == 8:
if major == 8:
return "col_ampere"
else:
return "col_turing"
......@@ -397,8 +398,6 @@ def nvidia_transform(
dim2 = ct.c_int32(shape[2])
ptr = CUBLAS_Context.get_instance().get_context(A.device)
ptrA = get_ptr(A)
ptrOut = get_ptr(out)
func(ptr, get_ptr(A), get_ptr(out), dim1, dim2)
return out, new_state
......@@ -1053,7 +1052,7 @@ def histogram_scatter_add_2d(
maxdim1 = ct.c_int32(histogram.shape[0])
n = ct.c_int32(index1.numel())
is_on_gpu([histogram, index1, index2d, source])
is_on_gpu([histogram, index1, index2, source])
lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n)
def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8):
......@@ -1512,7 +1511,7 @@ def get_colrow_absmax(
return row_stats, col_stats, nnz_block_ptr
class COOSparseTensor(object):
class COOSparseTensor:
def __init__(self, rows, cols, nnz, rowidx, colidx, values):
assert rowidx.dtype == torch.int32
assert colidx.dtype == torch.int32
......@@ -1529,7 +1528,7 @@ class COOSparseTensor(object):
self.values = values
class CSRSparseTensor(object):
class CSRSparseTensor:
def __init__(self, rows, cols, nnz, rowptr, colidx, values):
assert rowptr.dtype == torch.int32
assert colidx.dtype == torch.int32
......@@ -1546,7 +1545,7 @@ class CSRSparseTensor(object):
self.values = values
class CSCSparseTensor(object):
class CSCSparseTensor:
def __init__(self, rows, cols, nnz, colptr, rowidx, values):
assert colptr.dtype == torch.int32
assert rowidx.dtype == torch.int32
......@@ -1710,8 +1709,6 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No
dim1 = ct.c_int32(shape[0] * shape[1])
dim2 = ct.c_int32(shape[2])
ptrA = get_ptr(A)
ptrOut = get_ptr(out)
is_on_gpu([A, out])
if to_order == 'col32':
if transpose:
......
......@@ -2,24 +2,11 @@
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import (
Any,
Callable,
Dict,
Iterator,
Mapping,
Optional,
Set,
Tuple,
TypeVar,
Union,
overload,
)
from typing import Optional, TypeVar, Union, overload
import torch
import torch.nn.functional as F
from torch import Tensor, device, dtype, nn
from torch.nn.parameter import Parameter
import bitsandbytes as bnb
from bitsandbytes.optim import GlobalOptimManager
......@@ -39,7 +26,7 @@ class StableEmbedding(torch.nn.Embedding):
sparse: bool = False,
_weight: Optional[Tensor] = None,
) -> None:
super(StableEmbedding, self).__init__(
super().__init__(
num_embeddings,
embedding_dim,
padding_idx,
......@@ -96,7 +83,7 @@ class Embedding(torch.nn.Embedding):
sparse: bool = False,
_weight: Optional[Tensor] = None,
) -> None:
super(Embedding, self).__init__(
super().__init__(
num_embeddings,
embedding_dim,
padding_idx,
......@@ -225,7 +212,7 @@ class Linear8bitLt(nn.Linear):
threshold=0.0,
index=None,
):
super(Linear8bitLt, self).__init__(
super().__init__(
input_features, output_features, bias
)
self.state = bnb.MatmulLtState()
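The many super(SomeClass, self).__init__(...) calls rewritten as super().__init__(...) in this pull request rely on Python 3's zero-argument super(), which resolves to the same call inside a method body; a minimal sketch with hypothetical classes:

    class Base:
        def __init__(self, n):
            self.n = n

    class OldStyle(Base):
        def __init__(self, n):
            super(OldStyle, self).__init__(n)   # explicit two-argument form, still valid

    class NewStyle(Base):
        def __init__(self, n):
            super().__init__(n)                 # zero-argument form, identical behavior in Python 3

    assert OldStyle(3).n == NewStyle(3).n == 3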
......
......@@ -5,12 +5,11 @@
from bitsandbytes.cextension import COMPILED_WITH_CUDA
from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
from .adam import Adam, Adam8bit, Adam32bit
from .adamw import AdamW, AdamW8bit, AdamW32bit
from .sgd import SGD, SGD8bit, SGD32bit
from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
from .lamb import LAMB, LAMB8bit, LAMB32bit
from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
from .optimizer import GlobalOptimManager
from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
from .sgd import SGD, SGD8bit, SGD32bit
......@@ -21,18 +21,18 @@ class Adagrad(Optimizer1State):
block_wise=True,
):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= weight_decay:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay)
f"Invalid weight_decay value: {weight_decay}"
)
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
raise ValueError(f"Invalid epsilon value: {eps}")
if initial_accumulator_value != 0.0:
raise ValueError("Initial accumulator value != 0.0 not supported!")
if lr_decay != 0.0:
raise ValueError("Lr Decay != 0.0 not supported!")
super(Adagrad, self).__init__(
super().__init__(
"adagrad",
params,
lr,
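The .format(...) to f-string conversions across the optimizers are behavior-preserving; a one-line sketch of the equivalence with a hypothetical value:

    lr = -0.1
    assert "Invalid learning rate: {}".format(lr) == f"Invalid learning rate: {lr}"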
......@@ -63,19 +63,19 @@ class Adagrad8bit(Optimizer1State):
block_wise=True,
):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= weight_decay:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay)
f"Invalid weight_decay value: {weight_decay}"
)
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
raise ValueError(f"Invalid epsilon value: {eps}")
if initial_accumulator_value != 0.0:
raise ValueError("Initial accumulator value != 0.0 not supported!")
if lr_decay != 0.0:
raise ValueError("Lr Decay != 0.0 not supported!")
assert block_wise
super(Adagrad8bit, self).__init__(
super().__init__(
"adagrad",
params,
lr,
......@@ -106,18 +106,18 @@ class Adagrad32bit(Optimizer1State):
block_wise=True,
):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if not 0.0 <= weight_decay:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay)
f"Invalid weight_decay value: {weight_decay}"
)
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
raise ValueError(f"Invalid epsilon value: {eps}")
if initial_accumulator_value != 0.0:
raise ValueError("Initial accumulator value != 0.0 not supported!")
if lr_decay != 0.0:
raise ValueError("Lr Decay != 0.0 not supported!")
super(Adagrad32bit, self).__init__(
super().__init__(
"adagrad",
params,
lr,
......
......@@ -28,7 +28,7 @@ class Adam(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(Adam, self).__init__(
super().__init__(
"adam",
params,
lr,
......@@ -57,7 +57,7 @@ class Adam8bit(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(Adam8bit, self).__init__(
super().__init__(
"adam",
params,
lr,
......@@ -86,7 +86,7 @@ class Adam32bit(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(Adam32bit, self).__init__(
super().__init__(
"adam",
params,
lr,
......@@ -146,7 +146,7 @@ class AnalysisAdam(torch.optim.Optimizer):
weight_decay=weight_decay,
amsgrad=amsgrad,
)
super(AnalysisAdam, self).__init__(params, defaults)
super().__init__(params, defaults)
self.analysis = bnb_analysis
self.savedir = savedir
......
......@@ -20,7 +20,7 @@ class AdamW(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(AdamW, self).__init__(
super().__init__(
"adam",
params,
lr,
......@@ -49,7 +49,7 @@ class AdamW8bit(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(AdamW8bit, self).__init__(
super().__init__(
"adam",
params,
lr,
......@@ -78,7 +78,7 @@ class AdamW32bit(Optimizer2State):
percentile_clipping=100,
block_wise=True,
):
super(AdamW32bit, self).__init__(
super().__init__(
"adam",
params,
lr,
......
......@@ -23,7 +23,7 @@ class LAMB(Optimizer2State):
block_wise=False,
max_unorm=1.0,
):
super(LAMB, self).__init__(
super().__init__(
"lamb",
params,
lr,
......@@ -56,7 +56,7 @@ class LAMB8bit(Optimizer2State):
block_wise=False,
max_unorm=1.0,
):
super(LAMB8bit, self).__init__(
super().__init__(
"lamb",
params,
lr,
......@@ -89,7 +89,7 @@ class LAMB32bit(Optimizer2State):
block_wise=False,
max_unorm=1.0,
):
super(LAMB32bit, self).__init__(
super().__init__(
"lamb",
params,
lr,
......
......@@ -25,9 +25,9 @@ class LARS(Optimizer1State):
):
if momentum == 0:
raise NotImplementedError(
f"LARS without momentum is not supported!"
"LARS without momentum is not supported!"
)
super(LARS, self).__init__(
super().__init__(
"lars",
params,
lr,
......@@ -59,9 +59,9 @@ class LARS8bit(Optimizer1State):
):
if momentum == 0:
raise NotImplementedError(
f"LARS without momentum is not supported!"
"LARS without momentum is not supported!"
)
super(LARS8bit, self).__init__(
super().__init__(
"lars",
params,
lr,
......@@ -93,9 +93,9 @@ class LARS32bit(Optimizer1State):
):
if momentum == 0:
raise NotImplementedError(
f"LARS without momentum is not supported!"
"LARS without momentum is not supported!"
)
super(LARS32bit, self).__init__(
super().__init__(
"lars",
params,
lr,
......@@ -123,12 +123,12 @@ class PytorchLARS(Optimizer):
max_unorm=0.02,
):
if lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
raise ValueError(f"Invalid learning rate: {lr}")
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
raise ValueError(f"Invalid momentum value: {momentum}")
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay)
f"Invalid weight_decay value: {weight_decay}"
)
defaults = dict(
......@@ -143,10 +143,10 @@ class PytorchLARS(Optimizer):
raise ValueError(
"Nesterov momentum requires a momentum and zero dampening"
)
super(PytorchLARS, self).__init__(params, defaults)
super().__init__(params, defaults)
def __setstate__(self, state):
super(PytorchLARS, self).__setstate__(state)
super().__setstate__(state)
for group in self.param_groups:
group.setdefault("nesterov", False)
......@@ -181,7 +181,7 @@ class PytorchLARS(Optimizer):
state = self.state[p]
d_p = p.grad
if weight_decay != 0:
d_p = d_p.add(param, alpha=weight_decay)
d_p = d_p.add(p, alpha=weight_decay)
if momentum != 0:
buf = state.get("momentum_buffer", None)
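The change from d_p.add(param, ...) to d_p.add(p, ...) fixes a reference to a name that is not defined at that point; p is the parameter currently being iterated. A minimal sketch of what that line computes, with stand-in tensors (hypothetical values, not from this diff):

    import torch

    p = torch.ones(3)                     # stand-in for a model parameter
    d_p = torch.full((3,), 0.5)           # stand-in for p.grad
    weight_decay = 0.01

    # Classic L2 weight decay folds the parameter into the gradient before the step:
    d_p = d_p.add(p, alpha=weight_decay)  # d_p + weight_decay * p
    assert torch.allclose(d_p, torch.full((3,), 0.51))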
......