# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import nn

from .attention import AttentionBlockNew
from .resnet import Downsample2D, ResnetBlock, Upsample2D
def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    add_downsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
    downsample_padding=None,
):
    """Instantiate the down block class selected by ``down_block_type``.

    Args:
        down_block_type: Name of the block class to build. Supported values
            are ``"UNetResDownBlock2D"`` and ``"UNetResAttnDownBlock2D"``.
        num_layers: Forwarded unchanged to the block constructor.
        in_channels: Forwarded unchanged to the block constructor.
        out_channels: Forwarded unchanged to the block constructor.
        temb_channels: Forwarded unchanged to the block constructor.
        add_downsample: Forwarded unchanged to the block constructor.
        resnet_eps: Forwarded unchanged to the block constructor.
        resnet_act_fn: Forwarded unchanged to the block constructor.
        attn_num_head_channels: Only forwarded to ``UNetResAttnDownBlock2D``;
            ignored for ``UNetResDownBlock2D``.
        downsample_padding: Forwarded unchanged to the block constructor.
            NOTE(review): the default ``None`` is passed through verbatim and
            therefore replaces the blocks' own default of ``1`` -- confirm
            that ``Downsample2D`` accepts ``None``.

    Returns:
        The newly constructed block.

    Raises:
        ValueError: If ``down_block_type`` is not one of the supported names.
    """
    if down_block_type == "UNetResDownBlock2D":
        return UNetResDownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            downsample_padding=downsample_padding,
        )
    if down_block_type == "UNetResAttnDownBlock2D":
        return UNetResAttnDownBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            temb_channels=temb_channels,
            add_downsample=add_downsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            downsample_padding=downsample_padding,
            attn_num_head_channels=attn_num_head_channels,
        )
    # Fail loudly instead of handing ``None`` back to the caller, which would
    # only surface later as an unrelated-looking error.
    raise ValueError(f"{down_block_type} does not exist.")


def get_up_block(
    up_block_type,
    num_layers,
    in_channels,
    out_channels,
    prev_output_channel,
    temb_channels,
    add_upsample,
    resnet_eps,
    resnet_act_fn,
    attn_num_head_channels,
):
    """Instantiate the up block class selected by ``up_block_type``.

    Args:
        up_block_type: Name of the block class to build. Supported values are
            ``"UNetResUpBlock2D"`` and ``"UNetResAttnUpBlock2D"``.
        num_layers: Forwarded unchanged to the block constructor.
        in_channels: Forwarded unchanged to the block constructor.
        out_channels: Forwarded unchanged to the block constructor.
        prev_output_channel: Forwarded unchanged to the block constructor.
        temb_channels: Forwarded unchanged to the block constructor.
        add_upsample: Forwarded unchanged to the block constructor.
        resnet_eps: Forwarded unchanged to the block constructor.
        resnet_act_fn: Forwarded unchanged to the block constructor.
        attn_num_head_channels: Only forwarded to ``UNetResAttnUpBlock2D``;
            ignored for ``UNetResUpBlock2D``.

    Returns:
        The newly constructed block.

    Raises:
        ValueError: If ``up_block_type`` is not one of the supported names.
    """
    if up_block_type == "UNetResUpBlock2D":
        return UNetResUpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
        )
    if up_block_type == "UNetResAttnUpBlock2D":
        return UNetResAttnUpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            attn_num_head_channels=attn_num_head_channels,
        )
    # Fail loudly instead of handing ``None`` back to the caller, which would
    # only surface later as an unrelated-looking error.
    raise ValueError(f"{up_block_type} does not exist.")


class UNetMidBlock2D(nn.Module):
    """Middle block: one ``ResnetBlock`` followed by ``num_layers`` (attention, resnet) pairs.

    Every layer keeps the channel count at ``in_channels``.

    Args:
        in_channels: Channel count used for the input and output of every
            resnet and for every attention layer.
        temb_channels: Passed to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Passed to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of (attention, resnet) pairs appended after the
            first resnet.
        resnet_eps: Passed as ``eps`` to every resnet *and* attention layer.
        resnet_time_scale_shift: Passed to each ``ResnetBlock`` as
            ``time_embedding_norm``.
        resnet_act_fn: Passed to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Passed to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Passed to each ``ResnetBlock`` as ``pre_norm``.
        attn_num_head_channels: Passed to each attention layer as
            ``num_head_channels``.
        attention_type: ``"default"`` makes ``forward`` call the attention
            layers with the hidden states only; any other value makes it pass
            ``encoder_states`` as a second argument.
        output_scale_factor: Passed to the resnets as ``output_scale_factor``
            and to the attention layers as ``rescale_output_factor``.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        attention_type="default",
        output_scale_factor=1.0,
        **kwargs,
    ):
        super().__init__()

        self.attention_type = attention_type

        def make_resnet():
            # All resnets of the mid block share one configuration.
            return ResnetBlock(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )

        # there is always at least one resnet
        resnets = [make_resnet()]
        attentions = []

        for _ in range(num_layers):
            attentions.append(
                AttentionBlockNew(
                    in_channels,
                    num_head_channels=attn_num_head_channels,
                    rescale_output_factor=output_scale_factor,
                    eps=resnet_eps,
                )
            )
            resnets.append(make_resnet())

        # Registration order (attentions, then resnets) is kept as-is because
        # it determines the order of entries in the module's state dict.
        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

    def forward(self, hidden_states, temb=None, encoder_states=None):
        """Run the first resnet, then each (attention, resnet) pair in order.

        Args:
            hidden_states: Input handed to the first resnet.
            temb: Second argument handed to every resnet.
            encoder_states: Only used when ``attention_type`` is not
                ``"default"``; then passed as second argument to each
                attention layer.

        Returns:
            The output of the last resnet.
        """
        hidden_states = self.resnets[0](hidden_states, temb)

        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if self.attention_type == "default":
                hidden_states = attn(hidden_states)
            else:
                hidden_states = attn(hidden_states, encoder_states)
            hidden_states = resnet(hidden_states, temb)

        return hidden_states
class UNetResAttnDownBlock2D(nn.Module):
    """Down block of ``num_layers`` (``ResnetBlock``, ``AttentionBlockNew``) pairs plus an optional downsampler.

    The first resnet maps ``in_channels`` to ``out_channels``; all later
    resnets and every attention layer work on ``out_channels``.

    Args:
        in_channels: Input channel count of the first resnet.
        out_channels: Output channel count of every resnet.
        temb_channels: Passed to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Passed to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of (resnet, attention) pairs.
        resnet_eps: Passed as ``eps`` to every resnet *and* attention layer.
        resnet_time_scale_shift: Passed to each ``ResnetBlock`` as
            ``time_embedding_norm``.
        resnet_act_fn: Passed to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Passed to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Passed to each ``ResnetBlock`` as ``pre_norm``.
        attn_num_head_channels: Passed to each attention layer as
            ``num_head_channels``.
        attention_type: Stored on the instance; not read by ``forward``.
        output_scale_factor: Passed to the resnets as ``output_scale_factor``
            and to the attention layers as ``rescale_output_factor``.
        downsample_padding: Passed to ``Downsample2D`` as ``padding``.
        add_downsample: When true, a single ``Downsample2D`` is appended;
            otherwise ``self.downsamplers`` is ``None``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attn_num_head_channels=1,
        attention_type="default",
        output_scale_factor=1.0,
        downsample_padding=1,
        add_downsample=True,
    ):
        super().__init__()
        resnets = []
        attentions = []

        self.attention_type = attention_type

        # Channel count of the tensor flowing through the block so far. It is
        # tracked explicitly (rather than by rebinding ``in_channels``) so the
        # downsampler below is sized for what the last resnet really emits,
        # including the ``num_layers == 1`` case.
        current_channels = in_channels

        for _ in range(num_layers):
            resnets.append(
                ResnetBlock(
                    in_channels=current_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            attentions.append(
                AttentionBlockNew(
                    out_channels,
                    num_head_channels=attn_num_head_channels,
                    rescale_output_factor=output_scale_factor,
                    eps=resnet_eps,
                )
            )
            current_channels = out_channels

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        if add_downsample:
            # NOTE(review): assumes the first positional argument of
            # ``Downsample2D`` is its input channel count -- confirm.
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        current_channels,
                        use_conv=True,
                        out_channels=out_channels,
                        padding=downsample_padding,
                        name="op",
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(self, hidden_states, temb=None):
        """Apply every (resnet, attention) pair, then the downsampler(s) if present.

        Args:
            hidden_states: Input handed to the first resnet.
            temb: Second argument handed to every resnet.

        Returns:
            A ``(hidden_states, output_states)`` pair. ``output_states`` is a
            tuple holding the result after each (resnet, attention) pair and,
            when downsamplers exist, the downsampled result as last entry.
        """
        output_states = ()

        for resnet, attn in zip(self.resnets, self.attentions):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            # Only one entry is recorded, after all downsamplers have run.
            output_states += (hidden_states,)

        return hidden_states, output_states


class UNetResDownBlock2D(nn.Module):
    """Down block of ``num_layers`` ``ResnetBlock`` layers plus an optional downsampler.

    The first resnet maps ``in_channels`` to ``out_channels``; all later
    resnets work on ``out_channels``.

    Args:
        in_channels: Input channel count of the first resnet.
        out_channels: Output channel count of every resnet.
        temb_channels: Passed to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Passed to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of resnets.
        resnet_eps: Passed to each ``ResnetBlock`` as ``eps``.
        resnet_time_scale_shift: Passed to each ``ResnetBlock`` as
            ``time_embedding_norm``.
        resnet_act_fn: Passed to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Passed to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Passed to each ``ResnetBlock`` as ``pre_norm``.
        output_scale_factor: Passed to each ``ResnetBlock`` as
            ``output_scale_factor``.
        add_downsample: When true, a single ``Downsample2D`` is appended;
            otherwise ``self.downsamplers`` is ``None``.
        downsample_padding: Passed to ``Downsample2D`` as ``padding``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_downsample=True,
        downsample_padding=1,
    ):
        super().__init__()
        resnets = []

        # Channel count of the tensor flowing through the block so far. It is
        # tracked explicitly (rather than by rebinding ``in_channels``) so the
        # downsampler below is sized for what the last resnet really emits,
        # including the ``num_layers == 1`` case.
        current_channels = in_channels

        for _ in range(num_layers):
            resnets.append(
                ResnetBlock(
                    in_channels=current_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            current_channels = out_channels

        self.resnets = nn.ModuleList(resnets)

        if add_downsample:
            # NOTE(review): assumes the first positional argument of
            # ``Downsample2D`` is its input channel count -- confirm.
            self.downsamplers = nn.ModuleList(
                [
                    Downsample2D(
                        current_channels,
                        use_conv=True,
                        out_channels=out_channels,
                        padding=downsample_padding,
                        name="op",
                    )
                ]
            )
        else:
            self.downsamplers = None

    def forward(self, hidden_states, temb=None):
        """Apply every resnet, then the downsampler(s) if present.

        Args:
            hidden_states: Input handed to the first resnet.
            temb: Second argument handed to every resnet.

        Returns:
            A ``(hidden_states, output_states)`` pair. ``output_states`` is a
            tuple holding the result after each resnet and, when downsamplers
            exist, the downsampled result as last entry.
        """
        output_states = ()

        for resnet in self.resnets:
            hidden_states = resnet(hidden_states, temb)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            # Only one entry is recorded, after all downsamplers have run.
            output_states += (hidden_states,)

        return hidden_states, output_states


class UNetResAttnUpBlock2D(nn.Module):
    """Up block of ``num_layers`` (``ResnetBlock``, ``AttentionBlockNew``) pairs plus an optional upsampler.

    Each resnet consumes the running hidden states concatenated with one
    residual tensor taken from the end of the tuple given to ``forward``.

    Args:
        in_channels: Channel count assumed for the residual consumed by the
            *last* resnet; residuals of earlier resnets are assumed to have
            ``out_channels`` channels.
        prev_output_channel: Channel count assumed for the hidden states
            entering the first resnet.
        out_channels: Output channel count of every resnet, and the channel
            count of every attention layer and of the upsampler.
        temb_channels: Passed to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Passed to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of (resnet, attention) pairs.
        resnet_eps: Passed as ``eps`` to every resnet *and* attention layer.
        resnet_time_scale_shift: Passed to each ``ResnetBlock`` as
            ``time_embedding_norm``.
        resnet_act_fn: Passed to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Passed to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Passed to each ``ResnetBlock`` as ``pre_norm``.
        attention_type: Stored on the instance; not read by ``forward``.
        attn_num_head_channels: Passed to each attention layer as
            ``num_head_channels``.
        output_scale_factor: Passed to the resnets as ``output_scale_factor``
            and to the attention layers as ``rescale_output_factor``.
        add_upsample: When true, a single ``Upsample2D`` is appended;
            otherwise ``self.upsamplers`` is ``None``.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        attention_type="default",
        attn_num_head_channels=1,
        output_scale_factor=1.0,
        add_upsample=True,
    ):
        super().__init__()
        resnets = []
        attentions = []

        self.attention_type = attention_type

        for i in range(num_layers):
            # The resnet input is the concatenation of the running hidden
            # states and one residual, so its width is the sum of both.
            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                ResnetBlock(
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            attentions.append(
                AttentionBlockNew(
                    out_channels,
                    num_head_channels=attn_num_head_channels,
                    rescale_output_factor=output_scale_factor,
                    eps=resnet_eps,
                )
            )

        self.attentions = nn.ModuleList(attentions)
        self.resnets = nn.ModuleList(resnets)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
        else:
            self.upsamplers = None

    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
        """Consume one residual per (resnet, attention) pair, then upsample if configured.

        Args:
            hidden_states: Running activations entering the block.
            res_hidden_states_tuple: Residual tensors; they are consumed from
                the end, one per resnet. The caller's tuple object itself is
                not modified (only the local name is rebound to a slice).
            temb: Second argument handed to every resnet.

        Returns:
            The hidden states after the last attention layer and, if present,
            the upsampler(s).
        """
        for resnet, attn in zip(self.resnets, self.attentions):
            # pop res hidden states
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
            # Concatenate along dim 1 (presumably the channel axis).
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states)

        return hidden_states


class UNetResUpBlock2D(nn.Module):
    """Up block of ``num_layers`` ``ResnetBlock`` layers plus an optional upsampler.

    Each resnet consumes the running hidden states concatenated with one
    residual tensor taken from the end of the tuple given to ``forward``.

    Args:
        in_channels: Channel count assumed for the residual consumed by the
            *last* resnet; residuals of earlier resnets are assumed to have
            ``out_channels`` channels.
        prev_output_channel: Channel count assumed for the hidden states
            entering the first resnet.
        out_channels: Output channel count of every resnet and the channel
            count of the upsampler.
        temb_channels: Passed to each ``ResnetBlock`` as ``temb_channels``.
        dropout: Passed to each ``ResnetBlock`` as ``dropout``.
        num_layers: Number of resnets.
        resnet_eps: Passed to each ``ResnetBlock`` as ``eps``.
        resnet_time_scale_shift: Passed to each ``ResnetBlock`` as
            ``time_embedding_norm``.
        resnet_act_fn: Passed to each ``ResnetBlock`` as ``non_linearity``.
        resnet_groups: Passed to each ``ResnetBlock`` as ``groups``.
        resnet_pre_norm: Passed to each ``ResnetBlock`` as ``pre_norm``.
        output_scale_factor: Passed to each ``ResnetBlock`` as
            ``output_scale_factor``.
        add_upsample: When true, a single ``Upsample2D`` is appended;
            otherwise ``self.upsamplers`` is ``None``.
    """

    def __init__(
        self,
        in_channels: int,
        prev_output_channel: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        output_scale_factor=1.0,
        add_upsample=True,
    ):
        super().__init__()
        resnets = []

        for i in range(num_layers):
            # The resnet input is the concatenation of the running hidden
            # states and one residual, so its width is the sum of both.
            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
            resnet_in_channels = prev_output_channel if i == 0 else out_channels

            resnets.append(
                ResnetBlock(
                    in_channels=resnet_in_channels + res_skip_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )

        self.resnets = nn.ModuleList(resnets)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
        else:
            self.upsamplers = None

    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
        """Consume one residual per resnet, then upsample if configured.

        Args:
            hidden_states: Running activations entering the block.
            res_hidden_states_tuple: Residual tensors; they are consumed from
                the end, one per resnet. The caller's tuple object itself is
                not modified (only the local name is rebound to a slice).
            temb: Second argument handed to every resnet.

        Returns:
            The hidden states after the last resnet and, if present, the
            upsampler(s).
        """
        for resnet in self.resnets:
            # pop res hidden states
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
            # Concatenate along dim 1 (presumably the channel axis).
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            hidden_states = resnet(hidden_states, temb)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states)

        return hidden_states