Commit 9b7e6f49 authored by Patrick von Platen's avatar Patrick von Platen
Browse files

Merge branch 'main' of https://github.com/huggingface/diffusers into main

parents df64f624 1fd02631
...@@ -225,9 +225,6 @@ class ConfigMixin: ...@@ -225,9 +225,6 @@ class ConfigMixin:
text = reader.read() text = reader.read()
return json.loads(text) return json.loads(text)
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __repr__(self): def __repr__(self):
return f"{self.__class__.__name__} {self.to_json_string()}" return f"{self.__class__.__name__} {self.to_json_string()}"
......
...@@ -832,12 +832,12 @@ class GLIDE(DiffusionPipeline): ...@@ -832,12 +832,12 @@ class GLIDE(DiffusionPipeline):
# 1. Sample gaussian noise # 1. Sample gaussian noise
batch_size = 2 # second image is empty for classifier-free guidance batch_size = 2 # second image is empty for classifier-free guidance
image = self.text_noise_scheduler.sample_noise( image = torch.randn(
(batch_size, self.text_unet.in_channels, 64, 64), device=torch_device, generator=generator (batch_size, self.text_unet.in_channels, 64, 64), generator=generator
) ).to(torch_device)
# 2. Encode tokens # 2. Encode tokens
# an empty input is needed to guide the model away from ( # an empty input is needed to guide the model away from it
inputs = self.tokenizer([prompt, ""], padding="max_length", max_length=128, return_tensors="pt") inputs = self.tokenizer([prompt, ""], padding="max_length", max_length=128, return_tensors="pt")
input_ids = inputs["input_ids"].to(torch_device) input_ids = inputs["input_ids"].to(torch_device)
attention_mask = inputs["attention_mask"].to(torch_device) attention_mask = inputs["attention_mask"].to(torch_device)
...@@ -850,7 +850,7 @@ class GLIDE(DiffusionPipeline): ...@@ -850,7 +850,7 @@ class GLIDE(DiffusionPipeline):
mean, variance, log_variance, pred_xstart = self.p_mean_variance( mean, variance, log_variance, pred_xstart = self.p_mean_variance(
text_model_fn, self.text_noise_scheduler, image, t, transformer_out=transformer_out text_model_fn, self.text_noise_scheduler, image, t, transformer_out=transformer_out
) )
noise = self.text_noise_scheduler.sample_noise(image.shape, device=torch_device, generator=generator) noise = torch.randn(image.shape, generator=generator).to(torch_device)
nonzero_mask = (t != 0).float().view(-1, *([1] * (len(image.shape) - 1))) # no noise when t == 0 nonzero_mask = (t != 0).float().view(-1, *([1] * (len(image.shape) - 1))) # no noise when t == 0
image = mean + nonzero_mask * torch.exp(0.5 * log_variance) * noise image = mean + nonzero_mask * torch.exp(0.5 * log_variance) * noise
...@@ -873,8 +873,8 @@ class GLIDE(DiffusionPipeline): ...@@ -873,8 +873,8 @@ class GLIDE(DiffusionPipeline):
self.upscale_unet.resolution, self.upscale_unet.resolution,
), ),
generator=generator, generator=generator,
) ).to(torch_device)
image = image.to(torch_device) * upsample_temp image = image * upsample_temp
num_trained_timesteps = self.upscale_noise_scheduler.timesteps num_trained_timesteps = self.upscale_noise_scheduler.timesteps
inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps_upscale) inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps_upscale)
...@@ -896,7 +896,7 @@ class GLIDE(DiffusionPipeline): ...@@ -896,7 +896,7 @@ class GLIDE(DiffusionPipeline):
# 3. optionally sample variance # 3. optionally sample variance
variance = 0 variance = 0
if eta > 0: if eta > 0:
noise = torch.randn(image.shape, generator=generator).to(image.device) noise = torch.randn(image.shape, generator=generator).to(torch_device)
variance = ( variance = (
self.upscale_noise_scheduler.get_variance(t, num_inference_steps_upscale).sqrt() * eta * noise self.upscale_noise_scheduler.get_variance(t, num_inference_steps_upscale).sqrt() * eta * noise
) )
......
...@@ -19,7 +19,7 @@ import unittest ...@@ -19,7 +19,7 @@ import unittest
import torch import torch
from diffusers import DDIM, DDPM, PNDM, DDIMScheduler, DDPMScheduler, LatentDiffusion, PNDMScheduler, UNetModel from diffusers import DDIM, DDPM, PNDM, GLIDE, DDIMScheduler, DDPMScheduler, LatentDiffusion, PNDMScheduler, UNetModel
from diffusers.configuration_utils import ConfigMixin from diffusers.configuration_utils import ConfigMixin
from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipeline_utils import DiffusionPipeline
from diffusers.testing_utils import floats_tensor, slow, torch_device from diffusers.testing_utils import floats_tensor, slow, torch_device
...@@ -212,3 +212,18 @@ class PipelineTesterMixin(unittest.TestCase): ...@@ -212,3 +212,18 @@ class PipelineTesterMixin(unittest.TestCase):
assert image.shape == (1, 3, 256, 256) assert image.shape == (1, 3, 256, 256)
expected_slice = torch.tensor([0.7295, 0.7358, 0.7256, 0.7435, 0.7095, 0.6884, 0.7325, 0.6921, 0.6458]) expected_slice = torch.tensor([0.7295, 0.7358, 0.7256, 0.7435, 0.7095, 0.6884, 0.7325, 0.6921, 0.6458])
assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2 assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
@slow
def test_glide_text2img(self):
model_id = "fusing/glide-base"
glide = GLIDE.from_pretrained(model_id)
prompt = "a pencil sketch of a corgi"
generator = torch.manual_seed(0)
image = glide(prompt, generator=generator, num_inference_steps_upscale=20)
image_slice = image[0, :3, :3, -1].cpu()
assert image.shape == (1, 256, 256, 3)
expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment