Unverified Commit 797b290e authored by Suraj Patil's avatar Suraj Patil Committed by GitHub
Browse files

support bf16 for stable diffusion (#792)

* support bf16 for stable diffusion

* fix typo

* address review comments
parent 81bdbb5e
...@@ -41,6 +41,13 @@ class Upsample2D(nn.Module): ...@@ -41,6 +41,13 @@ class Upsample2D(nn.Module):
if self.use_conv_transpose: if self.use_conv_transpose:
return self.conv(hidden_states) return self.conv(hidden_states)
# Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
# TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
# https://github.com/pytorch/pytorch/issues/86679
dtype = hidden_states.dtype
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(torch.float32)
# if `output_size` is passed we force the interpolation output # if `output_size` is passed we force the interpolation output
# size and do not make use of `scale_factor=2` # size and do not make use of `scale_factor=2`
if output_size is None: if output_size is None:
...@@ -48,6 +55,10 @@ class Upsample2D(nn.Module): ...@@ -48,6 +55,10 @@ class Upsample2D(nn.Module):
else: else:
hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
# If the input is bfloat16, we cast back to bfloat16
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(dtype)
# TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
if self.use_conv: if self.use_conv:
if self.name == "conv": if self.name == "conv":
......
...@@ -327,7 +327,9 @@ class StableDiffusionPipeline(DiffusionPipeline): ...@@ -327,7 +327,9 @@ class StableDiffusionPipeline(DiffusionPipeline):
image = self.vae.decode(latents).sample image = self.vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1) image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).numpy()
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device) safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
image, has_nsfw_concept = self.safety_checker( image, has_nsfw_concept = self.safety_checker(
......
...@@ -38,8 +38,9 @@ class StableDiffusionSafetyChecker(PreTrainedModel): ...@@ -38,8 +38,9 @@ class StableDiffusionSafetyChecker(PreTrainedModel):
pooled_output = self.vision_model(clip_input)[1] # pooled_output pooled_output = self.vision_model(clip_input)[1] # pooled_output
image_embeds = self.visual_projection(pooled_output) image_embeds = self.visual_projection(pooled_output)
special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().numpy() # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().numpy() special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
result = [] result = []
batch_size = image_embeds.shape[0] batch_size = image_embeds.shape[0]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment