Commit d76078ef authored by yangql's avatar yangql
Browse files

initing develop

parent a2630e0f
......@@ -756,39 +756,39 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_triton = False
use_tritonv2 = False
if not disable_exllama and not EXLLAMA_KERNELS_AVAILABLE:
logger.warning(
"Exllama kernel is not installed, reset disable_exllama to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
# logger.warning(
# "Exllama kernel is not installed, reset disable_exllama to True. "
# "This may because you installed auto_gptq using a pre-build wheel "
# "on Windows, in which exllama_kernels are not compiled. To use "
# "exllama_kernels to further speedup inference, you can re-install "
# "auto_gptq from source."
# )
disable_exllama = True
if not disable_exllamav2 and not EXLLAMAV2_KERNELS_AVAILABLE:
logger.warning(
"Exllamav2 kernel is not installed, reset disable_exllamav2 to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
disable_exllamav2 = True
if not AUTOGPTQ_CUDA_AVAILABLE:
logger.warning(
"CUDA kernels for auto_gptq are not installed, this will result in "
"very slow inference speed. This may because:\n"
"1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.\n"
"2. You are using pytorch without CUDA support.\n"
"3. CUDA and nvcc are not installed in your device."
)
if use_qigen and QIGEN_AVAILABLE:
logger.warning("QIgen is active. Ignores all settings related to cuda.")
inject_fused_attention = False
inject_fused_mlp = False
use_triton = False
disable_exllama = False
# logger.warning(
# "Exllamav2 kernel is not installed, reset disable_exllamav2 to True. "
# "This may because you installed auto_gptq using a pre-build wheel "
# "on Windows, in which exllama_kernels are not compiled. To use "
# "exllama_kernels to further speedup inference, you can re-install "
# "auto_gptq from source."
# )
disable_exllamav2 = True
# if not AUTOGPTQ_CUDA_AVAILABLE:
# logger.warning(
# "CUDA kernels for auto_gptq are not installed, this will result in "
# "very slow inference speed. This may because:\n"
# "1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.\n"
# "2. You are using pytorch without CUDA support.\n"
# "3. CUDA and nvcc are not installed in your device."
# )
# if use_qigen and QIGEN_AVAILABLE:
# logger.warning("QIgen is active. Ignores all settings related to cuda.")
# inject_fused_attention = False
# inject_fused_mlp = False
# use_triton = False
# disable_exllama = False
# disable_exllamav2 = True
if not disable_exllamav2 and not disable_exllama:
logger.warning(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment