Unverified commit 6bfd13f0, authored by Álvaro Somoza, committed by GitHub

[SD3 Training] T5 token limit (#8564)



* initial commit

* default back to 77

* better text

* text correction

---------
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
parent eeb70033
@@ -106,6 +106,9 @@ To better track our training experiments, we're using the following flags in the
 * `report_to="wandb"` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
 * `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
+> [!NOTE]
+> If you want to train using long prompts with the T5 text encoder, you can use `--max_sequence_length` to set the token limit. The default is 77, but it can be increased to as high as 512. Note that this will use more resources and may slow down the training in some cases.
 > [!TIP]
 > You can pass `--use_8bit_adam` to reduce the memory requirements of training. Make sure to install `bitsandbytes` if you want to do so.
...
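The note above documents the `--max_sequence_length` flag added by this commit. As a quick illustration (a sketch, not part of the patch), the snippet below counts how many T5 tokens a prompt actually uses, which is a handy way to decide whether the default limit of 77 is enough or whether to raise the flag. The tokenizer checkpoint name is an assumption; substitute the `tokenizer_3` of the SD3 checkpoint you train on.

```python
# Sketch only: estimate how many T5 tokens a prompt uses, to decide whether the
# default --max_sequence_length of 77 is enough. "google/t5-v1_1-xxl" is an
# assumed stand-in for the tokenizer_3 shipped with your SD3 checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xxl")

prompt = (
    "a photo of sks dog wearing a tiny raincoat, sitting on a wooden bench "
    "in a rainy park, cinematic lighting, shallow depth of field, 35mm film look"
)
num_tokens = len(tokenizer(prompt, add_special_tokens=True).input_ids)
print(f"{num_tokens} T5 tokens (tokens beyond --max_sequence_length are truncated)")
```

If the count comes out above 77, pass a larger `--max_sequence_length` (up to 512), bearing in mind the extra memory use and slower steps the note warns about.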
@@ -298,6 +298,12 @@ def parse_args(input_args=None):
         default=None,
         help="The prompt to specify images in the same class as provided instance images.",
     )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with the T5 text encoder",
+    )
     parser.add_argument(
         "--validation_prompt",
         type=str,
@@ -830,6 +836,7 @@ def tokenize_prompt(tokenizer, prompt):
 def _encode_prompt_with_t5(
     text_encoder,
     tokenizer,
+    max_sequence_length,
     prompt=None,
     num_images_per_prompt=1,
     device=None,
@@ -840,7 +847,7 @@ def _encode_prompt_with_t5(
     text_inputs = tokenizer(
         prompt,
         padding="max_length",
-        max_length=77,
+        max_length=max_sequence_length,
         truncation=True,
         add_special_tokens=True,
         return_tensors="pt",
@@ -897,6 +904,7 @@ def encode_prompt(
     text_encoders,
     tokenizers,
     prompt: str,
+    max_sequence_length,
     device=None,
     num_images_per_prompt: int = 1,
 ):
@@ -924,6 +932,7 @@ def encode_prompt(
     t5_prompt_embed = _encode_prompt_with_t5(
         text_encoders[-1],
         tokenizers[-1],
+        max_sequence_length,
         prompt=prompt,
         num_images_per_prompt=num_images_per_prompt,
         device=device if device is not None else text_encoders[-1].device,
@@ -1297,7 +1306,9 @@ def main(args):
     def compute_text_embeddings(prompt, text_encoders, tokenizers):
         with torch.no_grad():
-            prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+            prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                text_encoders, tokenizers, prompt, args.max_sequence_length
+            )
             prompt_embeds = prompt_embeds.to(accelerator.device)
             pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
             return prompt_embeds, pooled_prompt_embeds
...
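The script changes above thread `max_sequence_length` from `parse_args` through `encode_prompt` and into the T5 tokenizer call in `_encode_prompt_with_t5`. As a rough sketch of the effect (again, not part of the patch), the snippet below mirrors the modified tokenizer call with a small public T5 checkpoint standing in for SD3's much larger `text_encoder_3`: because of `padding="max_length"` and `truncation=True`, the T5 prompt embeddings always come out with exactly `max_sequence_length` tokens, which is where the extra memory and compute for larger limits comes from.

```python
# Sketch only: mirrors the tokenizer call changed above, using a small public
# T5 checkpoint ("google/t5-v1_1-small") as an assumed stand-in for SD3's
# text_encoder_3 / tokenizer_3.
import torch
from transformers import AutoTokenizer, T5EncoderModel

checkpoint = "google/t5-v1_1-small"  # stand-in; the training script uses the pipeline's own T5
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
text_encoder = T5EncoderModel.from_pretrained(checkpoint)

prompt = "a photo of sks dog in a bucket"
for max_sequence_length in (77, 256):
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        prompt_embeds = text_encoder(text_inputs.input_ids)[0]
    # Shape is [1, max_sequence_length, hidden_size]: a larger limit means a
    # longer text sequence flowing through the model, hence more memory/compute.
    print(max_sequence_length, tuple(prompt_embeds.shape))
```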
@@ -297,6 +297,12 @@ def parse_args(input_args=None):
         default=None,
         help="The prompt to specify images in the same class as provided instance images.",
     )
+    parser.add_argument(
+        "--max_sequence_length",
+        type=int,
+        default=77,
+        help="Maximum sequence length to use with the T5 text encoder",
+    )
     parser.add_argument(
         "--validation_prompt",
         type=str,
@@ -828,6 +834,7 @@ def tokenize_prompt(tokenizer, prompt):
 def _encode_prompt_with_t5(
     text_encoder,
     tokenizer,
+    max_sequence_length,
     prompt=None,
     num_images_per_prompt=1,
     device=None,
@@ -838,7 +845,7 @@ def _encode_prompt_with_t5(
     text_inputs = tokenizer(
         prompt,
         padding="max_length",
-        max_length=77,
+        max_length=max_sequence_length,
         truncation=True,
         add_special_tokens=True,
         return_tensors="pt",
@@ -895,6 +902,7 @@ def encode_prompt(
     text_encoders,
     tokenizers,
     prompt: str,
+    max_sequence_length,
     device=None,
     num_images_per_prompt: int = 1,
 ):
@@ -922,6 +930,7 @@ def encode_prompt(
     t5_prompt_embed = _encode_prompt_with_t5(
         text_encoders[-1],
         tokenizers[-1],
+        max_sequence_length,
         prompt=prompt,
         num_images_per_prompt=num_images_per_prompt,
         device=device if device is not None else text_encoders[-1].device,
@@ -1324,7 +1333,9 @@ def main(args):
     def compute_text_embeddings(prompt, text_encoders, tokenizers):
         with torch.no_grad():
-            prompt_embeds, pooled_prompt_embeds = encode_prompt(text_encoders, tokenizers, prompt)
+            prompt_embeds, pooled_prompt_embeds = encode_prompt(
+                text_encoders, tokenizers, prompt, args.max_sequence_length
+            )
             prompt_embeds = prompt_embeds.to(accelerator.device)
             pooled_prompt_embeds = pooled_prompt_embeds.to(accelerator.device)
             return prompt_embeds, pooled_prompt_embeds
...