warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
ifA.shape[-1]%quant_state.blocksize!=0:
warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
# TODO: consider using `self.persistent_buffers = []` as a way to save the quant state
self.compute_dtype=compute_dtype
self.compute_type_is_set=False
...
...
@@ -224,10 +235,28 @@ class Linear4bit(nn.Linear):
warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.')
warnings.filterwarnings('ignore',message='.*inference or training')