# first plot the time occupied by different operations
# first plot the time occupied by different operations
fork,marker,ls,color,namein[
fork,marker,ls,color,namein[
('standard_gx+standard_gw+standard_fwd','s','-','C2','Standard fp16 (sum of parts)'),
("standard_gx+standard_gw+standard_fwd","s","-","C2","Standard fp16 (sum of parts)"),
('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd','o','-','C4','SwitchBack int8 (sum of parts)'),
warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
warn(
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
raiseTypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n{[(t.shape,t.device)fortintensors]}')
raiseTypeError(
f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n{[(t.shape,t.device)fortintensors]}",
)
iflen(gpu_ids)>1:
iflen(gpu_ids)>1:
raiseTypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n{[(t.shape,t.device)fortintensors]}')
raiseTypeError(
f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n{[(t.shape,t.device)fortintensors]}",
ifA.numel()<256:raiseNotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.')
ifA.numel()<256:
ifnum_quantiles>256:raiseNotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}")
raiseNotImplementedError(
ifnum_quantiles<256andoffset==1/(512):
f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.",
)
ifnum_quantiles>256:
raiseNotImplementedError(
f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}",
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
warnings.warn(
warnings.filterwarnings('ignore',message='.*inference or training')
"Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.",
)
warnings.filterwarnings("ignore",message=".*inference or training")