# first plot the time occupied by different operations
# first plot the time occupied by different operations
fork,marker,ls,color,namein[
fork,marker,ls,color,namein[
('standard_gx+standard_gw+standard_fwd','s','-','C2','Standard fp16 (sum of parts)'),
("standard_gx+standard_gw+standard_fwd","s","-","C2","Standard fp16 (sum of parts)"),
('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd','o','-','C4','SwitchBack int8 (sum of parts)'),
warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
warn(
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
warnings.warn(
warnings.filterwarnings('ignore',message='.*inference or training')
"Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.",
)
warnings.filterwarnings("ignore",message=".*inference or training")