Unverified Commit 48d0123f authored by Robert Dargavel Smith, committed by GitHub

add AudioDiffusionPipeline and LatentAudioDiffusionPipeline #1334 (#1426)



* add AudioDiffusionPipeline and LatentAudioDiffusionPipeline

* add docs to toc

* fix tests

* fix tests

* fix tests

* fix tests

* fix tests

* Update pr_tests.yml

Fix tests

* parent 499ff34b3edc3e0c506313ab48f21514d8f58b09
author teticio <teticio@gmail.com> 1668765652 +0000
committer teticio <teticio@gmail.com> 1669041721 +0000

add colab notebook

[Flax] Fix loading scheduler from subfolder (#1319)

[FLAX] Fix loading scheduler from subfolder

Fix/Enable all schedulers for in-painting (#1331)

* inpaint fix k lms

* onnx as well

* up

Correct path to scheduler (#1322)

* [Examples] Correct path

* up

Avoid nested fix-copies (#1332)

* Avoid nested `# Copied from` statements during `make fix-copies`

* style

Fix img2img speed with LMS-Discrete Scheduler (#896)

Casting `self.sigmas` into a different dtype (the one of original_samples) is not advisable. In my img2img pipeline this leads to a long running time in the `integrate.quad` call later on; by long I mean more than 10x slower.
Co-authored-by: Anton Lozhkov <anton@huggingface.co>

Fix the order of casts for onnx inpainting (#1338)

Legacy Inpainting Pipeline for Onnx Models (#1237)

* Add legacy inpainting pipeline compatibility for onnx

* remove commented out line

* Add onnx legacy inpainting test

* Fix slow decorators

* pep8 styling

* isort styling

* dummy object

* ordering consistency

* style

* docstring styles

* Refactor common prompt encoding pattern

* Update tests to permanent repository home

* support all available schedulers until ONNX IO binding is available
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* updated styling per PR review feedback
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

Jax infer support negative prompt (#1337)

* support negative prompts in sd jax pipeline

* pass batched neg_prompt

* only encode when negative prompt is None
Co-authored-by: Juan Acevedo <jfacevedo@google.com>

Update README.md: Minor change to Imagic code snippet, missing dir error (#1347)

Minor change to Imagic Readme

A missing directory causes an error when running the example code.

make style

change the sample model (#1352)

* Update alt_diffusion.mdx

* Update alt_diffusion.mdx

Add bit diffusion [WIP] (#971)

* Create bit_diffusion.py

Bit diffusion based on the paper arXiv:2208.04202 (Chen2022AnalogBG)

* adding bit diffusion to new branch

ran tests

* tests

* tests

* tests

* tests

* removed test folders + added to README

* Update README.md
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* move Mel to module in pipeline construction, make librosa optional

* fix imports

* fix copy & paste error in comment

* fix style

* add missing register_to_config

* fix class docstrings

* fix class docstrings

* tweak docstrings

* tweak docstrings

* update slow test

* put trailing commas back

* respect alphabetical order

* remove LatentAudioDiffusion, make vqvae optional

* move Mel from models back to pipelines :-)

* allow loading of pretrained audiodiffusion models

* fix tests

* fix dummies

* remove reference to latent_audio_diffusion in docs

* unused import

* inherit from SchedulerMixin to make loadable

* Apply suggestions from code review

* Apply suggestions from code review
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent 459b8ca8
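For context, a minimal usage sketch of the new pipeline; the checkpoint name and the output fields below are taken from the slow test in this diff, not from separate documentation:

```python
import torch
from diffusers import DiffusionPipeline

# The AudioDiffusionPipeline class is resolved from the checkpoint's model_index.json.
pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to("cuda")

generator = torch.Generator(device="cuda").manual_seed(42)
output = pipe(generator=generator)
image = output.images[0]  # the generated mel spectrogram as a PIL image
audio = output.audios[0]  # the waveform reconstructed from it via Mel
```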
@@ -26,6 +26,7 @@ from .import_utils import (
     is_accelerate_available,
     is_flax_available,
     is_inflect_available,
+    is_librosa_available,
     is_modelcards_available,
     is_onnx_available,
     is_safetensors_available,
@@ -152,6 +152,21 @@ class DiffusionPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch"])
 
 
+class AudioDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class DanceDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch"]

@@ -242,6 +257,21 @@ class LDMSuperResolutionPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch"])
 
 
+class Mel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class PNDMPipeline(metaclass=DummyObject):
     _backends = ["torch"]
# This file is autogenerated by the command `make fix-copies`, do not edit.
# flake8: noqa

from ..utils import DummyObject, requires_backends


class AudioDiffusionPipeline(metaclass=DummyObject):
    _backends = ["torch", "librosa"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "librosa"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "librosa"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "librosa"])


class Mel(metaclass=DummyObject):
    _backends = ["torch", "librosa"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "librosa"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "librosa"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "librosa"])
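Together with the `["torch"]`-only dummies above, these stubs keep `from diffusers import AudioDiffusionPipeline` working in any environment and defer the failure to first use. A sketch of the resulting behavior, assuming torch is installed but librosa is not:

```python
from diffusers import AudioDiffusionPipeline  # succeeds: resolves to the dummy class

pipe = AudioDiffusionPipeline()
# raises ImportError with LIBROSA_IMPORT_ERROR, pointing at
# https://librosa.org/doc/latest/install.html
```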
@@ -180,10 +180,17 @@ if _onnx_available:
 _scipy_available = importlib.util.find_spec("scipy") is not None
 try:
     _scipy_version = importlib_metadata.version("scipy")
-    logger.debug(f"Successfully imported transformers version {_scipy_version}")
+    logger.debug(f"Successfully imported scipy version {_scipy_version}")
 except importlib_metadata.PackageNotFoundError:
     _scipy_available = False
 
+_librosa_available = importlib.util.find_spec("librosa") is not None
+try:
+    _librosa_version = importlib_metadata.version("librosa")
+    logger.debug(f"Successfully imported librosa version {_librosa_version}")
+except importlib_metadata.PackageNotFoundError:
+    _librosa_available = False
+
 _accelerate_available = importlib.util.find_spec("accelerate") is not None
 try:
     _accelerate_version = importlib_metadata.version("accelerate")
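Both checks matter: `find_spec` only proves the module is importable (a stray local folder or namespace package would pass), while the metadata lookup additionally requires an installed distribution. The same probe, sketched standalone with the stdlib module in place of the `importlib_metadata` backport used by the file:

```python
import importlib.util
import importlib.metadata

available = importlib.util.find_spec("librosa") is not None
try:
    version = importlib.metadata.version("librosa")  # raises if no distribution is installed
except importlib.metadata.PackageNotFoundError:
    available = False
```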
@@ -244,6 +251,10 @@ def is_scipy_available():
     return _scipy_available
 
 
+def is_librosa_available():
+    return _librosa_available
+
+
 def is_xformers_available():
     return _xformers_available
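A sketch of how downstream modules can consume the new helper to keep librosa optional; the guard shown here is illustrative, not part of this diff:

```python
from diffusers.utils import is_librosa_available

if is_librosa_available():
    import librosa
else:
    librosa = None  # callers check availability before using librosa-backed features
```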
@@ -282,6 +293,12 @@ SCIPY_IMPORT_ERROR = """
 scipy`
 """
 
+# docstyle-ignore
+LIBROSA_IMPORT_ERROR = """
+{0} requires the librosa library but it was not found in your environment. Checkout the instructions on the
+installation page: https://librosa.org/doc/latest/install.html and follow the ones that match your environment.
+"""
+
 # docstyle-ignore
 TENSORFLOW_IMPORT_ERROR = """
 {0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the

@@ -311,6 +328,7 @@ BACKENDS_MAPPING = OrderedDict(
         ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
         ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)),
         ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
+        ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)),
     ]
 )
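`BACKENDS_MAPPING` is what `requires_backends` consults when a dummy object is touched. A simplified sketch of that helper (the real implementation in import_utils.py may differ in detail):

```python
def requires_backends(obj, backends):
    # Resolve a printable name for classes and instances alike.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    # Collect the import-error template of every requested backend that is missing.
    checks = (BACKENDS_MAPPING[backend] for backend in backends)
    failed = [template.format(name) for available, template in checks if not available()]
    if failed:
        raise ImportError("".join(failed))
```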
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest

import numpy as np
import torch

from diffusers import (
    AudioDiffusionPipeline,
    AutoencoderKL,
    DDIMScheduler,
    DDPMScheduler,
    DiffusionPipeline,
    Mel,
    UNet2DModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


torch.backends.cuda.matmul.allow_tf32 = False
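# TF32 is disabled above so CUDA matmuls stay in full fp32 precision and the
# byte-exact image slices asserted below reproduce across runs (an assumption
# about the intent; TF32 rounds fp32 matmul inputs).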
class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_unet(self):
        torch.manual_seed(0)
        model = UNet2DModel(
            sample_size=(32, 64),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return model
    @property
    def dummy_vqvae_and_unet(self):
        torch.manual_seed(0)
        vqvae = AutoencoderKL(
            sample_size=(128, 64),
            in_channels=1,
            out_channels=1,
            latent_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
        )
        unet = UNet2DModel(
            sample_size=(64, 32),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return vqvae, unet
    def test_audio_diffusion(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        mel = Mel()

        scheduler = DDPMScheduler()
        pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4)
        audio = output.audios[0]
        image = output.images[0]

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4, return_dict=False)
        image_from_tuple = output[0][0]

        assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length)
        assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1]

        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([255, 255, 255, 0, 181, 0, 124, 0, 15, 255])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0

        scheduler = DDIMScheduler()
        # reuse one instance; the property reseeds and rebuilds models on every access
        dummy_vqvae_and_unet = self.dummy_vqvae_and_unet
        pipe = AudioDiffusionPipeline(
            vqvae=dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler
        )
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        np.random.seed(0)
        raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,))
        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
        image = output.images[0]

        assert (
            image.height == dummy_vqvae_and_unet[0].sample_size[0]
            and image.width == dummy_vqvae_and_unet[0].sample_size[1]
        )
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
@slow
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_audio_diffusion(self):
        device = torch_device
        pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator)
        audio = output.audios[0]
        image = output.images[0]

        assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length)
        assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1]

        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
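The waveform-length assertions follow from how Mel maps spectrogram frames to samples: an image `w` pixels wide covers `(w - 1) * hop_length` samples. A quick check, assuming Mel's default `hop_length` of 512 (an assumption; see the Mel config):

```python
hop_length = 512  # assumed Mel() default
width = 64        # dummy_unet.sample_size[1] in the fast test above
print((width - 1) * hop_length)  # 32256, i.e. audio.shape == (1, 32256)
```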