Unverified Commit c5f04d4e authored by jiqing-feng's avatar jiqing-feng Committed by GitHub
Browse files

apply amp bf16 on textual inversion (#1465)

* add conf.yaml

* enable bf16

enable amp bf16 for unet forward

fix style

fix readme

remove useless file

* change amp to full bf16

* align

* make style

* fix format
parent 61dec533
...@@ -532,9 +532,15 @@ def main(): ...@@ -532,9 +532,15 @@ def main():
) )
accelerator.register_for_checkpointing(lr_scheduler) accelerator.register_for_checkpointing(lr_scheduler)
weight_dtype = torch.float32
if accelerator.mixed_precision == "fp16":
weight_dtype = torch.float16
elif accelerator.mixed_precision == "bf16":
weight_dtype = torch.bfloat16
# Move vae and unet to device # Move vae and unet to device
vae.to(accelerator.device) unet.to(accelerator.device, dtype=weight_dtype)
unet.to(accelerator.device) vae.to(accelerator.device, dtype=weight_dtype)
# Keep vae and unet in eval model as we don't train these # Keep vae and unet in eval model as we don't train these
vae.eval() vae.eval()
...@@ -600,11 +606,11 @@ def main(): ...@@ -600,11 +606,11 @@ def main():
with accelerator.accumulate(text_encoder): with accelerator.accumulate(text_encoder):
# Convert images to latent space # Convert images to latent space
latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach() latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
latents = latents * 0.18215 latents = latents * 0.18215
# Sample noise that we'll add to the latents # Sample noise that we'll add to the latents
noise = torch.randn(latents.shape).to(latents.device) noise = torch.randn(latents.shape).to(latents.device).to(dtype=weight_dtype)
bsz = latents.shape[0] bsz = latents.shape[0]
# Sample a random timestep for each image # Sample a random timestep for each image
timesteps = torch.randint( timesteps = torch.randint(
...@@ -616,7 +622,7 @@ def main(): ...@@ -616,7 +622,7 @@ def main():
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning # Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"])[0] encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)
# Predict the noise residual # Predict the noise residual
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
...@@ -629,7 +635,7 @@ def main(): ...@@ -629,7 +635,7 @@ def main():
else: else:
raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
loss = F.mse_loss(model_pred, target, reduction="none").mean([1, 2, 3]).mean() loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
accelerator.backward(loss) accelerator.backward(loss)
optimizer.step() optimizer.step()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment