save intermediate

acb948bd · Patrick von Platen · 8b8a339c · 8b8a339c · 8b8a339c · 8b8a339c
Commit acb948bd authored Jun 12, 2022 by Patrick von Platen
20 changed files
--- a/models/__init__.py
+++ b/models/__init__.py
--- a/models/audio/fastdiff/README.md
+++ b/models/audio/fastdiff/README.md
--- a/models/audio/fastdiff/modeling_fastdiff.py
+++ b/models/audio/fastdiff/modeling_fastdiff.py
--- a/models/audio/fastdiff/run_fastdiff.py
+++ b/models/audio/fastdiff/run_fastdiff.py
--- a/models/vision/__init__.py
+++ b/models/vision/__init__.py
--- a/models/vision/dalle2/README.md
+++ b/models/vision/dalle2/README.md
--- a/models/vision/dalle2/modeling_dalle2.py
+++ b/models/vision/dalle2/modeling_dalle2.py
--- a/models/vision/dalle2/run_dalle2.py
+++ b/models/vision/dalle2/run_dalle2.py
--- a/models/vision/ddim/README.md
+++ b/models/vision/ddim/README.md
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# Denoising Diffusion Implicit Models (DDIM)
-## Overview
-DDIM was proposed in [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) by *Jiaming Song, Chenlin Meng, Stefano Ermon*.
-The abstract from the paper is the following:
-*Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.*
-Tips:
- ...
- ...
-This model was contributed by [???](https://huggingface.co/???). The original code can be found [here](https://github.com/ermongroup/ddim).
--- a/models/vision/ddim/example.py
+++ b/models/vision/ddim/example.py
-#!/usr/bin/env python3
-import os
-import pathlib
-import numpy as np
-import PIL.Image
-from modeling_ddim import DDIM
-model_ids = ["ddim-celeba-hq", "ddim-lsun-church", "ddim-lsun-bedroom"]
-for model_id in model_ids:
-    path = os.path.join("/home/patrick/images/hf", model_id)
-    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
-    ddpm = DDIM.from_pretrained("fusing/" + model_id)
-    image = ddpm(batch_size=4)
-    image_processed = image.cpu().permute(0, 2, 3, 1)
-    image_processed = (image_processed + 1.0) * 127.5
-    image_processed = image_processed.numpy().astype(np.uint8)
-    for i in range(image_processed.shape[0]):
-        image_pil = PIL.Image.fromarray(image_processed[i])
-        image_pil.save(os.path.join(path, f"image_{i}.png"))
--- a/models/vision/ddim/modeling_ddim.py
+++ b/models/vision/ddim/modeling_ddim.py
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import tqdm
-from diffusers import DiffusionPipeline
-class DDIM(DiffusionPipeline):
-    def __init__(self, unet, noise_scheduler):
-        super().__init__()
-        self.register_modules(unet=unet, noise_scheduler=noise_scheduler)
-    def __call__(self, batch_size=1, generator=None, torch_device=None, eta=0.0, num_inference_steps=50):
-        # eta corresponds to η in paper and should be between [0, 1]
-        if torch_device is None:
-            torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.unet.to(torch_device)
-        # Sample gaussian noise to begin loop
-        image = self.noise_scheduler.sample_noise(
-            (batch_size, self.unet.in_channels, self.unet.resolution, self.unet.resolution),
-            device=torch_device,
-            generator=generator,
-        )
-        for t in tqdm.tqdm(reversed(range(num_inference_steps)), total=num_inference_steps):
-            # 1. predict noise residual
-            orig_t = self.noise_scheduler.get_orig_t(t, num_inference_steps)
-            with torch.no_grad():
-                residual = self.unet(image, orig_t)
-            # 2. predict previous mean of image x_t-1
-            pred_prev_image = self.noise_scheduler.compute_prev_image_step(residual, image, t, num_inference_steps, eta)
-            # 3. optionally sample variance
-            variance = 0
-            if eta > 0:
-                noise = self.noise_scheduler.sample_noise(image.shape, device=image.device, generator=generator)
-                variance = self.noise_scheduler.get_variance(t).sqrt() * eta * noise
-            # 4. set current image to prev_image: x_t -> x_t-1
-            image = pred_prev_image + variance
-        return image
--- a/models/vision/ddim/run_ddpm.py
+++ b/models/vision/ddim/run_ddpm.py
-#!/usr/bin/env python3
-import torch
-from diffusers import GaussianDDPMScheduler, UNetModel
-model = UNetModel(dim=64, dim_mults=(1, 2, 4, 8))
-diffusion = GaussianDDPMScheduler(model, image_size=128, timesteps=1000, loss_type="l1")  # number of steps  # L1 or L2
-training_images = torch.randn(8, 3, 128, 128)  # your images need to be normalized from a range of -1 to +1
-loss = diffusion(training_images)
-loss.backward()
-# after a lot of training
-sampled_images = diffusion.sample(batch_size=4)
-sampled_images.shape  # (4, 3, 128, 128)
--- a/models/vision/ddim/run_inference.py
+++ b/models/vision/ddim/run_inference.py
-#!/usr/bin/env python3
-# !pip install diffusers
-import numpy as np
-import PIL.Image
-from modeling_ddim import DDIM
-model_id = "fusing/ddpm-cifar10"
-model_id = "fusing/ddpm-lsun-bedroom"
-# load model and scheduler
-ddpm = DDIM.from_pretrained(model_id)
-# run pipeline in inference (sample random noise and denoise)
-image = ddpm()
-# process image to PIL
-image_processed = image.cpu().permute(0, 2, 3, 1)
-image_processed = (image_processed + 1.0) * 127.5
-image_processed = image_processed.numpy().astype(np.uint8)
-image_pil = PIL.Image.fromarray(image_processed[0])
-# save image
-image_pil.save("/home/patrick/images/show.png")
--- a/models/vision/ddpm/README.md
+++ b/models/vision/ddpm/README.md
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# Denoising Diffusion Probabilistic Models (DDPM)
-## Overview
-DDPM was proposed in [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) by *Jonathan Ho, Ajay Jain, Pieter Abbeel*.
-The abstract from the paper is the following:
-*We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion.*
-Tips:
- ...
- ...
-This model was contributed by [???](https://huggingface.co/???). The original code can be found [here](https://github.com/hojonathanho/diffusion).
-![ddpm](https://user-images.githubusercontent.com/23423619/171627620-e3406711-1e20-4a99-8e30-ec5a86a465be.png)
--- a/models/vision/ddpm/example.py
+++ b/models/vision/ddpm/example.py
-#!/usr/bin/env python3
-import os
-import pathlib
-import numpy as np
-import PIL.Image
-from modeling_ddpm import DDPM
-model_ids = [
-    "ddpm-lsun-cat",
-    "ddpm-lsun-cat-ema",
-    "ddpm-lsun-church-ema",
-    "ddpm-lsun-church",
-    "ddpm-lsun-bedroom",
-    "ddpm-lsun-bedroom-ema",
-    "ddpm-cifar10-ema",
-    "ddpm-cifar10",
-    "ddpm-celeba-hq",
-    "ddpm-celeba-hq-ema",
-]
-for model_id in model_ids:
-    path = os.path.join("/home/patrick/images/hf", model_id)
-    pathlib.Path(path).mkdir(parents=True, exist_ok=True)
-    ddpm = DDPM.from_pretrained("fusing/" + model_id)
-    image = ddpm(batch_size=4)
-    image_processed = image.cpu().permute(0, 2, 3, 1)
-    image_processed = (image_processed + 1.0) * 127.5
-    image_processed = image_processed.numpy().astype(np.uint8)
-    for i in range(image_processed.shape[0]):
-        image_pil = PIL.Image.fromarray(image_processed[i])
-        image_pil.save(os.path.join(path, f"image_{i}.png"))
--- a/models/vision/ddpm/modeling_ddpm.py
+++ b/models/vision/ddpm/modeling_ddpm.py
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import tqdm
-from diffusers import DiffusionPipeline
-class DDPM(DiffusionPipeline):
-    def __init__(self, unet, noise_scheduler):
-        super().__init__()
-        self.register_modules(unet=unet, noise_scheduler=noise_scheduler)
-    def __call__(self, batch_size=1, generator=None, torch_device=None):
-        if torch_device is None:
-            torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.unet.to(torch_device)
-        # Sample gaussian noise to begin loop
-        image = self.noise_scheduler.sample_noise(
-            (batch_size, self.unet.in_channels, self.unet.resolution, self.unet.resolution),
-            device=torch_device,
-            generator=generator,
-        )
-        num_prediction_steps = len(self.noise_scheduler)
-        for t in tqdm.tqdm(reversed(range(num_prediction_steps)), total=num_prediction_steps):
-            # 1. predict noise residual
-            with torch.no_grad():
-                residual = self.unet(image, t)
-            # 2. predict previous mean of image x_t-1
-            pred_prev_image = self.noise_scheduler.compute_prev_image_step(residual, image, t)
-            # 3. optionally sample variance
-            variance = 0
-            if t > 0:
-                noise = self.noise_scheduler.sample_noise(image.shape, device=image.device, generator=generator)
-                variance = self.noise_scheduler.get_variance(t).sqrt() * noise
-            # 4. set current image to prev_image: x_t -> x_t-1
-            image = pred_prev_image + variance
-        return image
--- a/models/vision/ddpm/run_ddpm.py
+++ b/models/vision/ddpm/run_ddpm.py
-#!/usr/bin/env python3
-import torch
-from diffusers import GaussianDDPMScheduler, UNetModel
-model = UNetModel(dim=64, dim_mults=(1, 2, 4, 8))
-diffusion = GaussianDDPMScheduler(model, image_size=128, timesteps=1000, loss_type="l1")  # number of steps  # L1 or L2
-training_images = torch.randn(8, 3, 128, 128)  # your images need to be normalized from a range of -1 to +1
-loss = diffusion(training_images)
-loss.backward()
-# after a lot of training
-sampled_images = diffusion.sample(batch_size=4)
-sampled_images.shape  # (4, 3, 128, 128)
--- a/models/vision/glide/README.md
+++ b/models/vision/glide/README.md
-# References
-[GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models](https://arxiv.org/pdf/2112.10741.pdf)
-[Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/pdf/2105.05233.pdf)
\ No newline at end of file
--- a/models/vision/glide/convert_weights.py
+++ b/models/vision/glide/convert_weights.py
-import torch
-from torch import nn
-from diffusers import (
-    ClassifierFreeGuidanceScheduler,
-    GlideDDIMScheduler,
-    GLIDESuperResUNetModel,
-    GLIDETextToImageUNetModel,
-)
-from modeling_glide import GLIDE, CLIPTextModel
-from transformers import CLIPTextConfig, GPT2Tokenizer
-# wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/base.pt
-state_dict = torch.load("base.pt", map_location="cpu")
-state_dict = {k: nn.Parameter(v) for k, v in state_dict.items()}
-### Convert the text encoder
-config = CLIPTextConfig(
-    vocab_size=50257,
-    max_position_embeddings=128,
-    hidden_size=512,
-    intermediate_size=2048,
-    num_hidden_layers=16,
-    num_attention_heads=8,
-    use_padding_embeddings=True,
-)
-model = CLIPTextModel(config).eval()
-tokenizer = GPT2Tokenizer(
-    "./glide-base/tokenizer/vocab.json", "./glide-base/tokenizer/merges.txt", pad_token="<|endoftext|>"
-)
-hf_encoder = model.text_model
-hf_encoder.embeddings.token_embedding.weight = state_dict["token_embedding.weight"]
-hf_encoder.embeddings.position_embedding.weight.data = state_dict["positional_embedding"]
-hf_encoder.embeddings.padding_embedding.weight.data = state_dict["padding_embedding"]
-hf_encoder.final_layer_norm.weight = state_dict["final_ln.weight"]
-hf_encoder.final_layer_norm.bias = state_dict["final_ln.bias"]
-for layer_idx in range(config.num_hidden_layers):
-    hf_layer = hf_encoder.encoder.layers[layer_idx]
-    hf_layer.self_attn.qkv_proj.weight = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_qkv.weight"]
-    hf_layer.self_attn.qkv_proj.bias = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_qkv.bias"]
-    hf_layer.self_attn.out_proj.weight = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_proj.weight"]
-    hf_layer.self_attn.out_proj.bias = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_proj.bias"]
-    hf_layer.layer_norm1.weight = state_dict[f"transformer.resblocks.{layer_idx}.ln_1.weight"]
-    hf_layer.layer_norm1.bias = state_dict[f"transformer.resblocks.{layer_idx}.ln_1.bias"]
-    hf_layer.layer_norm2.weight = state_dict[f"transformer.resblocks.{layer_idx}.ln_2.weight"]
-    hf_layer.layer_norm2.bias = state_dict[f"transformer.resblocks.{layer_idx}.ln_2.bias"]
-    hf_layer.mlp.fc1.weight = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_fc.weight"]
-    hf_layer.mlp.fc1.bias = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_fc.bias"]
-    hf_layer.mlp.fc2.weight = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_proj.weight"]
-    hf_layer.mlp.fc2.bias = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_proj.bias"]
-### Convert the Text-to-Image UNet
-text2im_model = GLIDETextToImageUNetModel(
-    in_channels=3,
-    model_channels=192,
-    out_channels=6,
-    num_res_blocks=3,
-    attention_resolutions=(2, 4, 8),
-    dropout=0.1,
-    channel_mult=(1, 2, 3, 4),
-    num_heads=1,
-    num_head_channels=64,
-    num_heads_upsample=1,
-    use_scale_shift_norm=True,
-    resblock_updown=True,
-    transformer_dim=512,
-)
-text2im_model.load_state_dict(state_dict, strict=False)
-text_scheduler = ClassifierFreeGuidanceScheduler(timesteps=1000, beta_schedule="squaredcos_cap_v2")
-### Convert the Super-Resolution UNet
-# wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/upsample.pt
-ups_state_dict = torch.load("upsample.pt", map_location="cpu")
-superres_model = GLIDESuperResUNetModel(
-    in_channels=6,
-    model_channels=192,
-    out_channels=6,
-    num_res_blocks=2,
-    attention_resolutions=(8, 16, 32),
-    dropout=0.1,
-    channel_mult=(1, 1, 2, 2, 4, 4),
-    num_heads=1,
-    num_head_channels=64,
-    num_heads_upsample=1,
-    use_scale_shift_norm=True,
-    resblock_updown=True,
-)
-superres_model.load_state_dict(ups_state_dict, strict=False)
-upscale_scheduler = GlideDDIMScheduler(timesteps=1000, beta_schedule="linear")
-glide = GLIDE(
-    text_unet=text2im_model,
-    text_noise_scheduler=text_scheduler,
-    text_encoder=model,
-    tokenizer=tokenizer,
-    upscale_unet=superres_model,
-    upscale_noise_scheduler=upscale_scheduler,
-)
-glide.save_pretrained("./glide-base")
--- a/models/vision/glide/modeling_glide.py
+++ b/models/vision/glide/modeling_glide.py