# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch

from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
Dhruv Nair's avatar
Dhruv Nair committed
23
from diffusers.utils import nightly, torch_device
24
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
25

26
27
from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
28

29

30
enable_full_determinism()
31
32


33
34
class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Fast tests for ``DanceDiffusionPipeline`` using a small, seeded ``UNet1DModel``.

    The shared checks come from ``PipelineTesterMixin``; the class attributes
    below configure which call parameters those checks exercise.
    """

    pipeline_class = DanceDiffusionPipeline
    params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
    # Remove these names from the mixin's set of required optional call
    # arguments (presumably because this pipeline's ``__call__`` does not
    # accept them -- confirm against the pipeline signature).
    required_optional_params = PipelineTesterMixin.required_optional_params - {
        "callback",
        "latents",
        "callback_steps",
        "output_type",
        "num_images_per_prompt",
    }
    batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
    test_attention_slicing = False

    def get_dummy_components(self):
        """Return the ``unet`` and ``scheduler`` components used to build the pipeline.

        The global torch seed is fixed before the UNet is constructed so that
        every call builds the model under the same RNG state.
        """
        torch.manual_seed(0)
        unet = UNet1DModel(
            block_out_channels=(32, 32, 64),
            extra_in_channels=16,
            sample_size=512,
            sample_rate=16_000,
            in_channels=2,
            out_channels=2,
            flip_sin_to_cos=True,
            use_timestep_embedding=False,
            time_embedding_type="fourier",
            mid_block_type="UNetMidBlock1D",
            down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
            up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
        )
        scheduler = IPNDMScheduler()

        components = {
            "unet": unet,
            "scheduler": scheduler,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        """Return the keyword arguments for a single short pipeline call.

        Args:
            device: Device the generator should live on. For devices whose
                string form starts with ``"mps"`` the global torch generator
                is seeded instead of creating a device-bound one.
            seed: Seed applied to the generator.
        """
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "batch_size": 1,
            "generator": generator,
            "num_inference_steps": 4,
        }
        return inputs

    def test_dance_diffusion(self):
        """Run the dummy pipeline on CPU and compare a slice of the audio to reference values."""
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        pipe = DanceDiffusionPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        audio = output.audios

        # The shape asserted below has only 2 rows on the middle axis, so this
        # slice holds 2 x 3 = 6 values, matching the length of expected_slice.
        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, components["unet"].sample_size)
        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

    @skip_mps
    def test_save_load_local(self):
        """Delegate to the mixin's check; decorated with ``skip_mps``."""
        return super().test_save_load_local()

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        """Delegate to the mixin's check with ``expected_max_difference=3e-3``."""
        return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)

    @skip_mps
    def test_save_load_optional_components(self):
        """Delegate to the mixin's check; decorated with ``skip_mps``."""
        return super().test_save_load_optional_components()

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        """Delegate to the mixin's check; decorated with ``skip_mps``."""
        return super().test_attention_slicing_forward_pass()

    def test_inference_batch_single_identical(self):
        """Delegate to the mixin's check with ``expected_max_diff=3e-3``."""
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)

118

Dhruv Nair's avatar
Dhruv Nair committed
119
@nightly
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
    """Integration tests that run the pretrained ``harmonai/maestro-150k`` checkpoint."""

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def _check_pretrained_audio(self, expected_slice, **load_kwargs):
        """Load the checkpoint, generate audio, and compare its trailing slice.

        Args:
            expected_slice: Reference values for the flattened
                ``audio[0, -3:, -3:]`` slice of the generated audio.
            **load_kwargs: Extra keyword arguments forwarded to
                ``DanceDiffusionPipeline.from_pretrained`` (e.g. ``torch_dtype``).
        """
        device = torch_device

        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", **load_kwargs)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
        audio = output.audios

        # The shape asserted below has only 2 rows on the middle axis, so this
        # slice holds 2 x 3 = 6 values, matching the six reference values.
        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, pipe.unet.sample_size)
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

    def test_dance_diffusion(self):
        """Check the output of the checkpoint loaded with default ``from_pretrained`` arguments."""
        expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])
        self._check_pretrained_audio(expected_slice)

    def test_dance_diffusion_fp16(self):
        """Check the output of the checkpoint loaded with ``torch_dtype=torch.float16``."""
        expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])
        self._check_pretrained_audio(expected_slice, torch_dtype=torch.float16)