f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
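    # The dispatch below selects the fused float16 "faster_old" CUDA kernels by
    # bit width. Each kernel takes the float16 activation, the bit-packed int32
    # qweight/qzeros buffers, the float32 scales, and the group size;
    # self.half_indim is presumably infeatures // 2, i.e. the input dimension
    # counted in half2 pairs for the fp16 kernels' packed loads.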
    if self.bits == 2:
        self.autogptq_cuda.vecquant2matmul_faster_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
            self.half_indim,
        )
    elif self.bits == 3:
        self.autogptq_cuda.vecquant3matmul_faster_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
            self.half_indim,
        )
    elif self.bits == 4:
        self.autogptq_cuda.vecquant4matmul_faster_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
            self.half_indim,
        )
    else:
        raise NotImplementedError("Only 2,3,4 bits are supported.")
else:
    x = x.to(torch.float32)  # This is required for autocast compatibility.
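    # In this non-fp16 path the activation is promoted to float32 and the plain
    # "_old" kernels are used; unlike the float16 path above, this branch also
    # provides an 8-bit variant.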
    if self.bits == 2:
        self.autogptq_cuda.vecquant2matmul_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
        )
    elif self.bits == 3:
        self.autogptq_cuda.vecquant3matmul_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
        )
    elif self.bits == 4:
        self.autogptq_cuda.vecquant4matmul_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
        )
    elif self.bits == 8:
        self.autogptq_cuda.vecquant8matmul_old(
            x,
            self.qweight,
            out,
            self.scales.float(),
            self.qzeros,
            self.group_size,
        )
    else:
        raise NotImplementedError("Only 2,3,4,8 bits are supported.")
raiseNotImplementedError("Only 4 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
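# qzeros holds the bit-packed quantization zero points. The packing buffer
# (presumably built as uint32) is cast to int32 and wrapped with
# torch.from_numpy, which shares memory with the numpy array rather than
# copying it. A minimal sketch of that packing for 4-bit values (illustrative
# only, assuming `zeros` is a 1-D array of ints in [0, 15]):
#     packed = np.zeros(len(zeros) // 8, dtype=np.uint32)
#     for i, z in enumerate(zeros):
#         packed[i // 8] |= np.uint32(z) << np.uint32(4 * (i % 8))
#     packed = packed.astype(np.int32)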

def forward(self, x):
    if x.dtype != torch.float16:
        logger.warning_once(
            f"The exllama kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
        )
f"The exllama v2 kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
raiseValueError("Can not use Marlin int4*fp16 kernel with AMD ROCm version of PyTorch as the kernel is not compatible. Please do not use `use_marlin=True` when using ROCm devices.")
ifnottorch.cuda.get_device_capability()[0]>=8:
raiseValueError(f'Can not use Marlin int4*fp16 kernel with a device of compute capability {torch.cuda.get_device_capability()}, the minimum compute capability is 8.0 for Marlin kernel. Please do not use `use_marlin=True`, or please upgrade your GPU ("The more you buy, the more you save." - Taiwanese proverb).')
ifinfeatures%128!=0oroutfeatures%256!=0:
raiseValueError("`infeatures` must be divisible by 128 and `outfeatures` by 256.")
ifbitsnotin[4]:
raiseNotImplementedError("Only 4 bits are supported.")