import gc
import unittest

import numpy as np
import torch
from transformers import AutoTokenizer, GemmaConfig, GemmaForCausalLM

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    LuminaNextDiT2DModel,
    LuminaPipeline,
)

from ...testing_utils import (
    backend_empty_cache,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    slow,
    torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin


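# Fast tests: build a tiny, randomly initialized pipeline on CPU and run the shared
# PipelineTesterMixin checks (batching, dtype casting, offloading, and similar).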
class LuminaPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
    pipeline_class = LuminaPipeline
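    # Call arguments exercised by the shared pipeline tests; batch_params are the
    # inputs that must also work when batched.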
    params = frozenset(
        [
            "prompt",
            "height",
            "width",
            "guidance_scale",
            "negative_prompt",
            "prompt_embeds",
            "negative_prompt_embeds",
        ]
    )
    batch_params = frozenset(["prompt", "negative_prompt"])

    supports_dduf = False
    test_layerwise_casting = True
    test_group_offloading = True

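    # Components are deliberately tiny (two layers, single-digit hidden sizes) so the
    # fast tests run in seconds on CPU; fixed seeds keep the random weights deterministic.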
    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = LuminaNextDiT2DModel(
            sample_size=4,
            patch_size=2,
            in_channels=4,
            hidden_size=4,
            num_layers=2,
            num_attention_heads=1,
            num_kv_heads=1,
            multiple_of=16,
            ffn_dim_multiplier=None,
            norm_eps=1e-5,
            learn_sigma=True,
            qk_norm=True,
            cross_attention_dim=8,
            scaling_factor=1.0,
        )
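        # The default AutoencoderKL config is already small enough for fast tests, and
        # its 4 latent channels match the transformer's in_channels.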
        torch.manual_seed(0)
        vae = AutoencoderKL()

        scheduler = FlowMatchEulerDiscreteScheduler()
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")

        torch.manual_seed(0)
        config = GemmaConfig(
            head_dim=2,
            hidden_size=8,
            intermediate_size=37,
            num_attention_heads=4,
            num_hidden_layers=2,
            num_key_value_heads=4,
        )
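        # Lumina-Next conditions on Gemma hidden states, so a randomly initialized
        # tiny Gemma stands in for the real text encoder.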
        text_encoder = GemmaForCausalLM(config)

        components = {
            "transformer": transformer.eval(),
            "vae": vae.eval(),
            "scheduler": scheduler,
            "text_encoder": text_encoder.eval(),
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
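        # torch.Generator is not supported on MPS, so seed the global RNG there instead.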
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "output_type": "np",
        }
        return inputs

    @unittest.skip("xformers attention processor does not exist for Lumina")
    def test_xformers_attention_forwardGenerator_pass(self):
        pass


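# Slow tests: load the full Lumina-Next-SFT checkpoint and compare generated pixels
# against reference values; requires an accelerator and network access.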
@slow
@require_torch_accelerator
class LuminaPipelineSlowTests(unittest.TestCase):
    pipeline_class = LuminaPipeline
    repo_id = "Alpha-VLLM/Lumina-Next-SFT-diffusers"

    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        return {
            "prompt": "A photo of a cat",
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "output_type": "np",
            "generator": generator,
        }

    def test_lumina_inference(self):
        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
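        # Model offloading moves each submodel to the accelerator only while it runs,
        # reducing peak memory at a small speed cost.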
        pipe.enable_model_cpu_offload(device=torch_device)

        inputs = self.get_inputs(torch_device)

        image = pipe(**inputs).images[0]
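        # Compare the first ten pixels of the top row against precomputed references.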
        image_slice = image[0, :10, :10]
        expected_slice = np.array(
            [
                [0.17773438, 0.18554688, 0.22070312],
                [0.046875, 0.06640625, 0.10351562],
                [0.0, 0.0, 0.02148438],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0],
            ],
            dtype=np.float32,
        )

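        # Despite the name, max_diff is a cosine-similarity distance over the flattened
        # slices, which tolerates small elementwise drift across hardware.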
        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

        assert max_diff < 1e-4