import torch
import torch.distributed as dist

from lightx2v.utils.envs import *


def compute_freqs(c, grid_sizes, freqs):
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0]
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)

    return freqs_i
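
# Minimal usage sketch with hypothetical shapes (not from a real config): the
# `freqs` argument is assumed to be the dim=1 concatenation of three
# rope_params tables, one per axis, so the split above recovers the temporal,
# height, and width tables.
#
#     c = 64  # complex rotary components per head (head_dim // 2)
#     freqs = torch.cat(
#         [
#             rope_params(1024, 2 * (c - 2 * (c // 3))),  # temporal
#             rope_params(1024, 2 * (c // 3)),  # height
#             rope_params(1024, 2 * (c // 3)),  # width
#         ],
#         dim=1,
#     )
#     freqs_i = compute_freqs(c, [(4, 6, 8)], freqs)  # -> (4 * 6 * 8, 1, 64)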


def compute_freqs_audio(c, grid_sizes, freqs):
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0]
    valid_token_length = f * h * w
    f = f + 1  # for r2v: add one extra frame for the reference embeddings
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),  # temporal (frame) encoding
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),  # spatial (height) encoding
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),  # spatial (width) encoding
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)

    freqs_i[valid_token_length:, :, :f] = 0  # for r2v: zero the temporal components of the reference-embedding tokens

    return freqs_i
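
# Note: the r2v layout places the f * h * w video tokens first, followed by
# one extra "frame" of h * w reference tokens; the assignment above zeroes
# the leading rotary components of those reference tokens so they receive no
# temporal rotation.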


def compute_freqs_dist(s, c, grid_sizes, freqs, seq_p_group):
    world_size = dist.get_world_size(seq_p_group)
    cur_rank = dist.get_rank(seq_p_group)
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0]
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)

    freqs_i = pad_freqs(freqs_i, s * world_size)
    s_per_rank = s
    freqs_i_rank = freqs_i[(cur_rank * s_per_rank) : ((cur_rank + 1) * s_per_rank), :, :]
    return freqs_i_rank


def compute_freqs_audio_dist(s, c, grid_sizes, freqs, seq_p_group):
    world_size = dist.get_world_size(seq_p_group)
    cur_rank = dist.get_rank(seq_p_group)
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0]
    valid_token_length = f * h * w
    f = f + 1
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)

    freqs_i[valid_token_length:, :, :f] = 0

    freqs_i = pad_freqs(freqs_i, s * world_size)
    s_per_rank = s
    freqs_i_rank = freqs_i[(cur_rank * s_per_rank) : ((cur_rank + 1) * s_per_rank), :, :]
    return freqs_i_rank
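
# Sequence-parallel sketch (hypothetical setup): with a sequence-parallel
# group of `world_size` ranks and a per-rank shard length `s`, the table is
# padded to s * world_size rows and rank r keeps rows [r * s, (r + 1) * s),
# matching how the sequence is sharded across the group.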


def compute_freqs_causvid(c, grid_sizes, freqs, start_frame=0):
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
    f, h, w = grid_sizes[0]
    seq_len = f * h * w
    freqs_i = torch.cat(
        [
            freqs[0][start_frame : start_frame + f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
        ],
        dim=-1,
    ).reshape(seq_len, 1, -1)

    return freqs_i
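
# Usage sketch (hypothetical values): `start_frame` offsets only the temporal
# lookup, so a later block of frames reuses the same spatial tables while
# continuing the temporal rotation, e.g.:
#
#     freqs_blk = compute_freqs_causvid(c, [(4, 6, 8)], freqs, start_frame=8)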


def pad_freqs(original_tensor, target_len):
    seq_len, s1, s2 = original_tensor.shape
    pad_size = target_len - seq_len
    padding_tensor = torch.ones(pad_size, s1, s2, dtype=original_tensor.dtype, device=original_tensor.device)
    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
    return padded_tensor
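
# Note: the pad value is one, not zero. The tables built by rope_params are
# complex, and torch.ones with a complex dtype yields 1 + 0j, an identity
# rotation, so padded rows leave their tokens unrotated in apply_rotary_emb.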


def apply_rotary_emb(x, freqs_i):
    n = x.size(1)
    seq_len = freqs_i.size(0)

    x_i = torch.view_as_complex(x[:seq_len].to(torch.float64).reshape(seq_len, n, -1, 2))
    # Apply rotary embedding
    x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
    x_i = torch.cat([x_i, x[seq_len:]])
    return x_i.to(GET_DTYPE())
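
# Minimal usage sketch (illustrative shapes): the first seq_len tokens are
# rotated, any trailing tokens pass through unchanged.
#
#     x = torch.randn(192 + 8, 12, 128)   # 192 rotary tokens + 8 pass-through
#     out = apply_rotary_emb(x, freqs_i)  # freqs_i of shape (192, 1, 64)
#     assert out.shape == x.shape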


def apply_rotary_emb_chunk(x, freqs_i, chunk_size, remaining_chunk_size=100):
    n = x.size(1)
    seq_len = freqs_i.size(0)

    output_chunks = []
    for start in range(0, seq_len, chunk_size):
        end = min(start + chunk_size, seq_len)
        x_chunk = x[start:end]
        freqs_chunk = freqs_i[start:end]

        x_chunk_complex = torch.view_as_complex(x_chunk.to(torch.float32).reshape(end - start, n, -1, 2))
        x_chunk_embedded = torch.view_as_real(x_chunk_complex * freqs_chunk).flatten(2).to(GET_DTYPE())
        output_chunks.append(x_chunk_embedded)
        del x_chunk_complex, x_chunk_embedded
        torch.cuda.empty_cache()

    # append the remaining (non-rotary) tokens unchanged, in smaller chunks
    for start in range(seq_len, x.size(0), remaining_chunk_size):
        end = min(start + remaining_chunk_size, x.size(0))
        output_chunks.append(x[start:end])

    x_i = torch.cat(output_chunks, dim=0)
    del output_chunks
    torch.cuda.empty_cache()

    return x_i.to(GET_DTYPE())
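
# Note: apply_rotary_emb_chunk trades speed for peak memory. The complex
# multiply is done in chunk_size slices so the float32 intermediates never
# materialize for the whole sequence at once, and torch.cuda.empty_cache()
# releases cached blocks between chunks at some runtime cost.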


def rope_params(max_seq_len, dim, theta=10000):
    assert dim % 2 == 0
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim)),
    )
    freqs = torch.polar(torch.ones_like(freqs), freqs)
    return freqs
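
# The returned table is complex with shape (max_seq_len, dim // 2); entry
# [p, k] equals exp(1j * p * theta ** (-2 * k / dim)), a unit rotation whose
# angle grows with position p and decays with frequency index k.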


def sinusoidal_embedding_1d(dim, position):
    # preprocess
    assert dim % 2 == 0
    half = dim // 2
    position = position.type(torch.float64)

    # calculation
    sinusoid = torch.outer(position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
    x = x.to(GET_SENSITIVE_DTYPE())
    return x
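
# Usage sketch (illustrative): sinusoidal_embedding_1d(dim, torch.arange(4))
# returns a (4, dim) tensor whose first dim // 2 columns are cosines and
# last dim // 2 columns are sines of the position-scaled frequencies.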


def guidance_scale_embedding(w, embedding_dim=256, cfg_range=(1.0, 6.0), target_range=1000.0, dtype=torch.float32):
    """
    Args:
    timesteps: torch.Tensor: generate embedding vectors at these timesteps
    embedding_dim: int: dimension of the embeddings to generate
    dtype: data type of the generated embeddings

    Returns:
    embedding vectors with shape `(len(timesteps), embedding_dim)`
    """
    assert len(w.shape) == 1
    cfg_min, cfg_max = cfg_range
    w = torch.round(w)
    w = torch.clamp(w, min=cfg_min, max=cfg_max)
    w = (w - cfg_min) / (cfg_max - cfg_min)  # [0, 1]
    w = w * target_range
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype).to(w.device) * -emb).to(w.device)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb
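

if __name__ == "__main__":
    # Smoke test with illustrative shapes; assumes lightx2v.utils.envs
    # provides GET_DTYPE, as used throughout this module.
    c = 64  # complex rotary components per head (head_dim // 2)
    freqs = torch.cat(
        [
            rope_params(1024, 2 * (c - 2 * (c // 3))),
            rope_params(1024, 2 * (c // 3)),
            rope_params(1024, 2 * (c // 3)),
        ],
        dim=1,
    )
    freqs_i = compute_freqs(c, [(4, 6, 8)], freqs)
    assert freqs_i.shape == (4 * 6 * 8, 1, c)

    x = torch.randn(4 * 6 * 8 + 8, 12, 2 * c)  # 8 extra pass-through tokens
    out = apply_rotary_emb(x, freqs_i)
    assert out.shape == x.shape

    w = torch.tensor([1.0, 3.5, 6.0])
    emb = guidance_scale_embedding(w, embedding_dim=256)
    assert emb.shape == (3, 256)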