Unverified commit f9b4bea0 authored by Alexander Kozlov, committed by GitHub

Added cache_block_outputs option to enable GPTQ for non-regular models (#27032)



* Added cache_block_outputs option to enable GPTQ for non-regular models

* Update src/transformers/utils/quantization_config.py
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update src/transformers/utils/quantization_config.py
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Fixed style

* Update src/transformers/utils/quantization_config.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 037fb7d0
@@ -360,6 +360,8 @@ class GPTQConfig(QuantizationConfigMixin):
max_input_length (`int`, *optional*):
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
length. It is specific to the exllama backend with act-order.
cache_block_outputs (`bool`, *optional*, defaults to `True`):
Whether to cache block outputs to reuse as inputs for the succeeding block.
"""
def __init__(
@@ -380,6 +382,7 @@ class GPTQConfig(QuantizationConfigMixin):
pad_token_id: Optional[int] = None,
disable_exllama: bool = False,
max_input_length: Optional[int] = None,
cache_block_outputs: bool = True,
**kwargs,
):
self.quant_method = QuantizationMethod.GPTQ
@@ -399,6 +402,7 @@ class GPTQConfig(QuantizationConfigMixin):
self.pad_token_id = pad_token_id
self.disable_exllama = disable_exllama
self.max_input_length = max_input_length
self.cache_block_outputs = cache_block_outputs
self.post_init()
def get_loading_attributes(self):
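A minimal usage sketch of the new flag, assuming the standard `GPTQConfig` / `from_pretrained` quantization flow in transformers; the model id and calibration dataset below are illustrative placeholders, not part of this commit:

```python
# A minimal sketch, assuming the usual GPTQ quantization flow in transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder model chosen for illustration
tokenizer = AutoTokenizer.from_pretrained(model_id)

gptq_config = GPTQConfig(
    bits=4,
    dataset="c4",               # calibration data for GPTQ
    tokenizer=tokenizer,
    cache_block_outputs=False,  # disable block-output caching for non-regular models
)

# Quantization runs while the model is loaded when a GPTQConfig is passed.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
```

Leaving `cache_block_outputs` at its default of `True` keeps the existing behavior, where each block's outputs are cached and reused as inputs for the next block.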