"src/turbomind/vscode:/vscode.git/clone" did not exist on "d5cb0be2cd16e6c5eefd4d266a38357fde83a660"
Unverified Commit 76b7d86a authored by SahilCarterr's avatar SahilCarterr Committed by GitHub
Browse files

Updated _encode_prompt_with_clip and encode_prompt in train_dreambooth_sd3 (#9800)



* Updated encode_prompt and the CLIP encode-prompt helper


---------
Co-authored-by: default avatarSayak Paul <spsayakpaul@gmail.com>
parent e2b3c248
...@@ -902,11 +902,13 @@ def _encode_prompt_with_clip( ...@@ -902,11 +902,13 @@ def _encode_prompt_with_clip(
tokenizer, tokenizer,
prompt: str, prompt: str,
device=None, device=None,
text_input_ids=None,
num_images_per_prompt: int = 1, num_images_per_prompt: int = 1,
): ):
prompt = [prompt] if isinstance(prompt, str) else prompt prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt) batch_size = len(prompt)
if tokenizer is not None:
text_inputs = tokenizer( text_inputs = tokenizer(
prompt, prompt,
padding="max_length", padding="max_length",
...@@ -916,6 +918,10 @@ def _encode_prompt_with_clip( ...@@ -916,6 +918,10 @@ def _encode_prompt_with_clip(
) )
text_input_ids = text_inputs.input_ids text_input_ids = text_inputs.input_ids
else:
if text_input_ids is None:
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
pooled_prompt_embeds = prompt_embeds[0] pooled_prompt_embeds = prompt_embeds[0]
...@@ -937,6 +943,7 @@ def encode_prompt( ...@@ -937,6 +943,7 @@ def encode_prompt(
max_sequence_length, max_sequence_length,
device=None, device=None,
num_images_per_prompt: int = 1, num_images_per_prompt: int = 1,
text_input_ids_list=None,
): ):
prompt = [prompt] if isinstance(prompt, str) else prompt prompt = [prompt] if isinstance(prompt, str) else prompt
...@@ -945,13 +952,14 @@ def encode_prompt( ...@@ -945,13 +952,14 @@ def encode_prompt(
clip_prompt_embeds_list = [] clip_prompt_embeds_list = []
clip_pooled_prompt_embeds_list = [] clip_pooled_prompt_embeds_list = []
for tokenizer, text_encoder in zip(clip_tokenizers, clip_text_encoders): for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip( prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
text_encoder=text_encoder, text_encoder=text_encoder,
tokenizer=tokenizer, tokenizer=tokenizer,
prompt=prompt, prompt=prompt,
device=device if device is not None else text_encoder.device, device=device if device is not None else text_encoder.device,
num_images_per_prompt=num_images_per_prompt, num_images_per_prompt=num_images_per_prompt,
text_input_ids=text_input_ids_list[i] if text_input_ids_list else None,
) )
clip_prompt_embeds_list.append(prompt_embeds) clip_prompt_embeds_list.append(prompt_embeds)
clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds) clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment