Unverified commit 0a151115 authored by Akshay Babbar, committed by GitHub

Fix #12116: preserve boolean dtype for attention masks in ChromaPipeline (#12263)



* fix: preserve boolean dtype for attention masks in ChromaPipeline

- Convert attention masks to bool and prevent dtype corruption
- Fix both positive and negative mask handling in _get_t5_prompt_embeds
- Remove float conversion in _prepare_attention_mask method

Fixes #12116

* test: add ChromaPipeline attention mask dtype tests

* test: add slow ChromaPipeline attention mask tests

* chore: removed comments

* refactor: removing redundant type conversion

* Remove dedicated dtype tests as per feedback

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 19085ac8
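For context on the fix described above, a minimal, self-contained sketch of the boolean mask construction (tensor values are illustrative, not taken from the pipeline):

```python
import torch

# Hypothetical tokenizer output: 3 real tokens followed by 2 padding tokens.
attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
batch_size = attention_mask.size(0)

# Chroma keeps one padding token visible, so `<=` (not `<`) is used below.
seq_lengths = attention_mask.sum(dim=1)  # tensor([3])
mask_indices = torch.arange(attention_mask.size(1)).unsqueeze(0).expand(batch_size, -1)
attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).bool()

print(attention_mask)  # tensor([[ True,  True,  True,  True, False]])
```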
@@ -238,7 +238,7 @@ class ChromaPipeline(
         # Chroma requires the attention mask to include one padding token
         seq_lengths = attention_mask.sum(dim=1)
         mask_indices = torch.arange(attention_mask.size(1)).unsqueeze(0).expand(batch_size, -1)
-        attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).long()
+        attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).bool()
         prompt_embeds = self.text_encoder(
             text_input_ids.to(device), output_hidden_states=False, attention_mask=attention_mask.to(device)
@@ -246,7 +246,7 @@ class ChromaPipeline(
         dtype = self.text_encoder.dtype
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-        attention_mask = attention_mask.to(dtype=dtype, device=device)
+        attention_mask = attention_mask.to(device=device)
         _, seq_len, _ = prompt_embeds.shape
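A small sketch of the corrected cast above (dtype, device, and shapes are assumed for illustration): the prompt embeddings take the text encoder's dtype, while the mask only changes device and stays boolean.

```python
import torch

# Assumed stand-ins for the pipeline's runtime values.
dtype, device = torch.float16, "cpu"
prompt_embeds = torch.randn(1, 5, 8)
attention_mask = torch.ones(1, 5, dtype=torch.bool)

prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
# The fix: no `dtype=dtype` here, so the mask keeps torch.bool.
attention_mask = attention_mask.to(device=device)
assert attention_mask.dtype is torch.bool
```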
@@ -605,10 +605,9 @@ class ChromaPipeline(
         # Extend the prompt attention mask to account for image tokens in the final sequence
         attention_mask = torch.cat(
-            [attention_mask, torch.ones(batch_size, sequence_length, device=attention_mask.device)],
+            [attention_mask, torch.ones(batch_size, sequence_length, device=attention_mask.device, dtype=torch.bool)],
             dim=1,
         )
-        attention_mask = attention_mask.to(dtype)
         return attention_mask
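The hunk above matters because `torch.cat` type-promotes mixed inputs; a short demonstration with hypothetical shapes:

```python
import torch

prompt_mask = torch.ones(2, 4, dtype=torch.bool)  # hypothetical text-token mask
image_ones = torch.ones(2, 3)                     # defaults to torch.float32

# Without an explicit dtype, cat promotes bool + float32 to float32,
# which is the silent dtype corruption the patch removes.
print(torch.cat([prompt_mask, image_ones], dim=1).dtype)  # torch.float32

# With dtype=torch.bool, the concatenated mask stays boolean end to end.
image_bool = torch.ones(2, 3, dtype=torch.bool)
print(torch.cat([prompt_mask, image_bool], dim=1).dtype)  # torch.bool
```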