Unverified commit 574a5384 authored by Younes Belkada, committed by GitHub

[`FA-2`] Revert suggestion that broke FA2 fine-tuning with quantized models (#26916)

revert
parent caa0ff0b
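For context: `getattr(obj, name, default)` evaluates its `default` argument eagerly, before the attribute lookup happens, so the reverted one-liner touched `self.q_proj.weight` (or `self.query_key_value.weight`) even when `_pre_quantization_dtype` was present on the config. A minimal sketch of that failure mode, assuming a quantized or adapter-wrapped projection whose bare `.weight` access raises; `Config` and `QuantizedProj` below are illustrative stand-ins, not code from this diff:

```python
import torch


class QuantizedProj:
    """Stand-in for a quantized/adapter-wrapped projection layer whose
    bare ``.weight`` access raises (an assumption for illustration)."""

    @property
    def weight(self):
        raise AttributeError("weight is not directly accessible on this wrapper")


class Config:
    _pre_quantization_dtype = torch.float16


config, q_proj = Config(), QuantizedProj()

# Restored pattern: ``q_proj.weight`` is never touched when the
# config attribute exists, so this works on the wrapped layer.
if hasattr(config, "_pre_quantization_dtype"):
    target_dtype = config._pre_quantization_dtype
else:
    target_dtype = q_proj.weight.dtype
print(target_dtype)  # torch.float16

# Reverted one-liner: Python evaluates the default argument first,
# so this raises even though ``_pre_quantization_dtype`` is present.
try:
    target_dtype = getattr(config, "_pre_quantization_dtype", q_proj.weight.dtype)
except AttributeError as exc:
    print(f"getattr one-liner failed: {exc}")
```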
@@ -614,7 +614,10 @@ class FalconFlashAttention2(FalconAttention):
         input_dtype = query_layer.dtype
         if input_dtype == torch.float32:
             # Handle the case where the model is quantized
-            target_dtype = getattr(self.config, "_pre_quantization_dtype", self.query_key_value.weight.dtype)
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.query_key_value.weight.dtype
             logger.warning_once(
                 f"The input hidden states seems to be silently casted in float32, this might be related to"
@@ -476,7 +476,10 @@ class LlamaFlashAttention2(LlamaAttention):
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
             # Handle the case where the model is quantized
-            target_dtype = getattr(self.config, "_pre_quantization_dtype", self.q_proj.weight.dtype)
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
             logger.warning_once(
                 f"The input hidden states seems to be silently casted in float32, this might be related to"
@@ -409,7 +409,10 @@ class MistralFlashAttention2(MistralAttention):
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
             # Handle the case where the model is quantized
-            target_dtype = getattr(self.config, "_pre_quantization_dtype", self.q_proj.weight.dtype)
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
             logger.warning_once(
                 f"The input hidden states seems to be silently casted in float32, this might be related to"
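All three hunks restore the same dtype-selection logic. A self-contained sketch of the upcast handling around it, with hypothetical `resolve_target_dtype` and `FakeConfig` names introduced only for illustration:

```python
import torch


def resolve_target_dtype(config, fallback_dtype: torch.dtype) -> torch.dtype:
    """Mirror the restored branch: prefer the dtype recorded before
    quantization, fall back to the projection weight's dtype."""
    if hasattr(config, "_pre_quantization_dtype"):
        return config._pre_quantization_dtype
    return fallback_dtype


class FakeConfig:
    pass


cfg = FakeConfig()
cfg._pre_quantization_dtype = torch.bfloat16

# Hidden states that were silently upcast to float32 somewhere upstream.
query_states = torch.randn(2, 8, 16, dtype=torch.float32)
if query_states.dtype == torch.float32:
    query_states = query_states.to(resolve_target_dtype(cfg, torch.float16))
print(query_states.dtype)  # torch.bfloat16
```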