# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import nn

from .attention import AttentionBlockNew
from .resnet import Downsample2D, ResnetBlock, Upsample2D


def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    add_downsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    downsample_padding=None,
):
    """Instantiate the down block class selected by ``down_block_type``.

    Args:
        down_block_type: Name of the block class to build; either
            ``"UNetResDownBlock2D"`` or ``"UNetResAttnDownBlock2D"``.
        num_layers: Forwarded as ``num_layers``.
        in_channels: Forwarded as ``in_channels``.
        out_channels: Forwarded as ``out_channels``.
        temb_channels: Forwarded as ``temb_channels``.
        add_downsample: Forwarded as ``add_downsample``.
        resnet_eps: Forwarded as ``resnet_eps``.
        resnet_act_fn: Forwarded as ``resnet_act_fn``.
        attn_num_head_channels: Forwarded only to ``UNetResAttnDownBlock2D``.
        downsample_padding: Forwarded only to ``UNetResDownBlock2D``. The value is passed through
            unchanged, including the default ``None``.

    Returns:
        The constructed block.

    Raises:
        ValueError: If ``down_block_type`` is not one of the supported names.
    """
    if down_block_type == "UNetResDownBlock2D":
        return UNetResDownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            downsample_padding=downsample_padding,
        )
    elif down_block_type == "UNetResAttnDownBlock2D":
        return UNetResAttnDownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            attn_num_head_channels=attn_num_head_channels,
        )

    # Fail loudly instead of silently returning None for a misspelled or unsupported block name.
    raise ValueError(f"{down_block_type} does not exist.")


def get_up_block(
    up_block_type,
    num_layers,
    in_channels,
    out_channels,
    prev_output_channel,
    temb_channels,
    add_upsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
):
    """Instantiate the up block class selected by ``up_block_type``.

    Args:
        up_block_type: Name of the block class to build; either
            ``"UNetResUpBlock2D"`` or ``"UNetResAttnUpBlock2D"``.
        num_layers: Forwarded as ``num_layers``.
        in_channels: Forwarded as ``in_channels``.
        out_channels: Forwarded as ``out_channels``.
        prev_output_channel: Forwarded as ``prev_output_channel``.
        temb_channels: Forwarded as ``temb_channels``.
        add_upsample: Forwarded as ``add_upsample``.
        resnet_eps: Forwarded as ``resnet_eps``.
        resnet_act_fn: Forwarded as ``resnet_act_fn``.
        attn_num_head_channels: Forwarded only to ``UNetResAttnUpBlock2D``.

    Returns:
        The constructed block.

    Raises:
        ValueError: If ``up_block_type`` is not one of the supported names.
    """
    if up_block_type == "UNetResUpBlock2D":
        return UNetResUpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
        )
    elif up_block_type == "UNetResAttnUpBlock2D":
        return UNetResAttnUpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            attn_num_head_channels=attn_num_head_channels,
        )

    # Fail loudly instead of silently returning None for a misspelled or unsupported block name.
    raise ValueError(f"{up_block_type} does not exist.")


class UNetMidBlock2D(nn.Module):
    """Middle block: one resnet followed by ``num_layers`` (attention, resnet) pairs.

    Every resnet is built with ``in_channels`` as both its input and output channel count, and every
    attention layer is built for ``in_channels`` channels. Extra keyword arguments are accepted and
    ignored.
    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        attention_type="default",
        output_scale_factor=1.0,
        **kwargs,
    ):
        super().__init__()

        self.attention_type = attention_type

        def build_resnet():
            # All resnets of this block share one configuration.
            return ResnetBlock(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )

        def build_attention():
            return AttentionBlockNew(
                in_channels,
                num_head_channels=attn_num_head_channels,
                rescale_output_factor=output_scale_factor,
                eps=resnet_eps,
            )

        # there is always at least one resnet
        resnet_layers = [build_resnet()]
        attention_layers = []

        # Layers are created in the order attention -> resnet for each additional layer.
        for _ in range(num_layers):
            attention_layers.append(build_attention())
            resnet_layers.append(build_resnet())

        self.attentions = nn.ModuleList(attention_layers)
        self.resnets = nn.ModuleList(resnet_layers)

    def forward(self, hidden_states, temb=None, encoder_states=None):
        """Run the first resnet, then each (attention, resnet) pair in order.

        ``encoder_states`` is handed to the attention layers only when ``self.attention_type`` is not
        ``"default"``; ``temb`` is handed to every resnet.
        """
        hidden_states = self.resnets[0](hidden_states, temb)

        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if self.attention_type != "default":
                attended = attn(hidden_states, encoder_states)
            else:
                attended = attn(hidden_states)
            hidden_states = resnet(attended, temb)

        return hidden_states
class UNetResAttnDownBlock2D(nn.Module):
    """Down block made of ``num_layers`` (resnet, attention) pairs plus an optional downsampler.

    The first resnet is built with ``in_channels`` input channels; every later resnet, and every
    attention layer, is built with ``out_channels``.

    Args:
        in_channels: Input channel count of the first resnet.
        out_channels: Output channel count of every resnet and channel count of every attention layer.
        temb_channels: Forwarded to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Forwarded to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of (resnet, attention) pairs.
        resnet_eps: Forwarded as ``eps`` to both the resnets and the attention layers.
        resnet_time_scale_shift: Forwarded to each ``ResnetBlock`` as ``time_embedding_norm``.
        resnet_act_fn: Forwarded to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Forwarded to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Forwarded to each ``ResnetBlock`` as ``pre_norm``.
        attn_num_head_channels: Forwarded to each attention layer as ``num_head_channels``.
        attention_type: Stored on ``self.attention_type``; not consulted by ``forward``.
        output_scale_factor: Forwarded to the resnets (``output_scale_factor``) and attention layers
            (``rescale_output_factor``).
        add_downsample: When true, a single ``Downsample2D`` is appended after the layers.
        downsample_padding: Forwarded to ``Downsample2D`` as ``padding``. Defaults to 1, the value
            that used to be hard-coded.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        attention_type="default",
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
    ):
        super().__init__()
        resnets = []
        attentions = []

        self.attention_type = attention_type

        for i in range(num_layers):
            # Use a separate local so the `in_channels` argument is not overwritten by the loop.
            resnet_in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                ResnetBlock(
                    in_channels=resnet_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            attentions.append(
                AttentionBlockNew(
                    out_channels,
                    num_head_channels=attn_num_head_channels,
                    rescale_output_factor=output_scale_factor,
                    eps=resnet_eps,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        if add_downsample:
            # The downsampler runs on the output of the last resnet/attention pair, which was built
            # with `out_channels`; only a block without any layers still sees `in_channels`.
            downsample_in_channels = out_channels if num_layers > 0 else in_channels
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        downsample_in_channels,
                        use_conv=True,
                        out_channels=out_channels,
                        padding=downsample_padding,
                        name="op",
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(self, hidden_states, temb=None):
        """Apply every (resnet, attention) pair, then the downsamplers if present.

        Returns:
            A tuple ``(hidden_states, output_states)`` where ``output_states`` holds the hidden states
            after each (resnet, attention) pair and, when downsamplers exist, one final entry with
            the downsampled hidden states.
        """
        output_states = ()

        for resnet, attn in zip(self.resnets, self.attentions):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states


class UNetResDownBlock2D(nn.Module):
    """Down block made of ``num_layers`` resnets plus an optional downsampler.

    The first resnet is built with ``in_channels`` input channels; every later resnet is built with
    ``out_channels`` input channels. All resnets produce ``out_channels``.

    Args:
        in_channels: Input channel count of the first resnet.
        out_channels: Output channel count of every resnet.
        temb_channels: Forwarded to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Forwarded to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of resnets.
        resnet_eps: Forwarded to each ``ResnetBlock`` as ``eps``.
        resnet_time_scale_shift: Forwarded to each ``ResnetBlock`` as ``time_embedding_norm``.
        resnet_act_fn: Forwarded to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Forwarded to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Forwarded to each ``ResnetBlock`` as ``pre_norm``.
        output_scale_factor: Forwarded to each ``ResnetBlock`` as ``output_scale_factor``.
        add_downsample: When true, a single ``Downsample2D`` is appended after the resnets.
        downsample_padding: Forwarded to ``Downsample2D`` as ``padding``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
    ):
        super().__init__()
        resnets = []

        for i in range(num_layers):
            # Use a separate local so the `in_channels` argument is not overwritten by the loop.
            resnet_in_channels = in_channels if i == 0 else out_channels
            resnets.append(
                ResnetBlock(
                    in_channels=resnet_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )

        self.resnets = nn.ModuleList(resnets)

        if add_downsample:
            # The downsampler runs on the output of the last resnet, which was built with
            # `out_channels`; only a block without any resnets still sees `in_channels`.
            downsample_in_channels = out_channels if num_layers > 0 else in_channels
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        downsample_in_channels,
                        use_conv=True,
                        out_channels=out_channels,
                        padding=downsample_padding,
                        name="op",
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(self, hidden_states, temb=None):
        """Apply every resnet, then the downsamplers if present.

        Returns:
            A tuple ``(hidden_states, output_states)`` where ``output_states`` holds the hidden states
            after each resnet and, when downsamplers exist, one final entry with the downsampled
            hidden states.
        """
        output_states = ()

        for resnet in self.resnets:
            hidden_states = resnet(hidden_states, temb)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states


class UNetResAttnUpBlock2D(nn.Module):
    """Up block made of ``num_layers`` (resnet, attention) pairs plus an optional upsampler.

    Each resnet consumes the running hidden states concatenated with one skip tensor, so its input
    channel count is the sum of both: the running part is ``prev_output_channel`` for the first layer
    and ``out_channels`` afterwards; the skip part is ``in_channels`` for the last layer and
    ``out_channels`` otherwise.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attention_type="default",
        attn_num_head_channels=1,
        output_scale_factor=1.0,
        add_upsample=True,
    ):
        super().__init__()

        self.attention_type = attention_type

        # Settings shared by every resnet of this block; only the input width differs per layer.
        shared_resnet_kwargs = dict(
            out_channels=out_channels,
            temb_channels=temb_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            time_embedding_norm=resnet_time_scale_shift,
            non_linearity=resnet_act_fn,
            output_scale_factor=output_scale_factor,
            pre_norm=resnet_pre_norm,
        )

        resnet_layers = []
        attention_layers = []
        last_layer = num_layers - 1

        for layer_idx in range(num_layers):
            skip_width = in_channels if layer_idx == last_layer else out_channels
            running_width = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(ResnetBlock(in_channels=running_width + skip_width, **shared_resnet_kwargs))
            attention_layers.append(
                AttentionBlockNew(
                    out_channels,
                    num_head_channels=attn_num_head_channels,
                    rescale_output_factor=output_scale_factor,
                    eps=resnet_eps,
                )
            )

        self.attentions = nn.ModuleList(attention_layers)
        self.resnets = nn.ModuleList(resnet_layers)

        self.upsamplers = (
            nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
            if add_upsample
            else None
        )

    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
        """Consume skip tensors from the end of ``res_hidden_states_tuple``, one per layer.

        For every (resnet, attention) pair the last remaining skip tensor is concatenated to the
        hidden states along dim 1 before the resnet and attention are applied. The upsamplers, if
        any, run last.
        """
        remaining_skips = res_hidden_states_tuple

        for resnet, attn in zip(self.resnets, self.attentions):
            # pop res hidden states
            skip_states = remaining_skips[-1]
            remaining_skips = remaining_skips[:-1]

            merged = torch.cat([hidden_states, skip_states], dim=1)
            hidden_states = attn(resnet(merged, temb))

        if self.upsamplers is None:
            return hidden_states

        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states)

        return hidden_states


class UNetResUpBlock2D(nn.Module):
    """Up block made of ``num_layers`` resnets plus an optional upsampler.

    Each resnet consumes the running hidden states concatenated with one skip tensor, so its input
    channel count is the sum of both: the running part is ``prev_output_channel`` for the first layer
    and ``out_channels`` afterwards; the skip part is ``in_channels`` for the last layer and
    ``out_channels`` otherwise.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_upsample=True,
    ):
        super().__init__()

        # Settings shared by every resnet of this block; only the input width differs per layer.
        shared_resnet_kwargs = dict(
            out_channels=out_channels,
            temb_channels=temb_channels,
            eps=resnet_eps,
            groups=resnet_groups,
            dropout=dropout,
            time_embedding_norm=resnet_time_scale_shift,
            non_linearity=resnet_act_fn,
            output_scale_factor=output_scale_factor,
            pre_norm=resnet_pre_norm,
        )

        resnet_layers = []
        last_layer = num_layers - 1

        for layer_idx in range(num_layers):
            skip_width = in_channels if layer_idx == last_layer else out_channels
            running_width = prev_output_channel if layer_idx == 0 else out_channels

            resnet_layers.append(ResnetBlock(in_channels=running_width + skip_width, **shared_resnet_kwargs))

        self.resnets = nn.ModuleList(resnet_layers)

        self.upsamplers = (
            nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
            if add_upsample
            else None
        )

    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
        """Consume skip tensors from the end of ``res_hidden_states_tuple``, one per resnet.

        For every resnet the last remaining skip tensor is concatenated to the hidden states along
        dim 1 before the resnet is applied. The upsamplers, if any, run last.
        """
        remaining_skips = res_hidden_states_tuple

        for resnet in self.resnets:
            # pop res hidden states
            skip_states = remaining_skips[-1]
            remaining_skips = remaining_skips[:-1]

            merged = torch.cat([hidden_states, skip_states], dim=1)
            hidden_states = resnet(merged, temb)

        if self.upsamplers is None:
            return hidden_states

        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states)

        return hidden_states