Commit e2364931 authored by mashun1's avatar mashun1
Browse files

pixart-alpha

parents
Pipeline #861 canceled with stages
# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
# Please use this implementation in your products
# This implementation may produce slightly different results from Saining Xie's official implementations,
# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
# Different from official models and other implementations, this is an RGB-input model (rather than BGR)
# and in this way it works better for gradio's RGB protocol
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent.parent))
from torch import nn
import torch
import numpy as np
from torchvision import transforms as T
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import json
from PIL import Image
import torchvision.transforms.functional as TF
from accelerate import Accelerator
from diffusers.models import AutoencoderKL
import os
# Size passed to T.Resize / T.CenterCrop in InternData's transform; it is also
# embedded in the output directory name (hed_feature_<image_resize>) in main().
image_resize = 1024
class DoubleConvBlock(nn.Module):
    """A run of 3x3 convolutions (each followed by ReLU) plus a 1x1 projection head.

    Args:
        input_channel: Channel count of the incoming feature map.
        output_channel: Channel count produced by every convolution in the run.
        layer_number: Total number of 3x3 convolutions (at least one is always built).
    """

    def __init__(self, input_channel, output_channel, layer_number):
        super().__init__()
        # Only the first conv changes the channel count; the remaining
        # (layer_number - 1) convs map output_channel -> output_channel.
        conv_inputs = [input_channel] + [output_channel] * (layer_number - 1)
        self.convs = torch.nn.Sequential(*[
            torch.nn.Conv2d(in_channels=channels, out_channels=output_channel,
                            kernel_size=(3, 3), stride=(1, 1), padding=1)
            for channels in conv_inputs
        ])
        # Single-channel side output computed from the block's final features.
        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1,
                                          kernel_size=(1, 1), stride=(1, 1), padding=0)

    def forward(self, x, down_sampling=False):
        """Return ``(features, projection)``.

        When ``down_sampling`` is true the input is first reduced with a 2x2,
        stride-2 max pool.
        """
        if down_sampling:
            features = torch.nn.functional.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2))
        else:
            features = x
        for layer in self.convs:
            features = torch.nn.functional.relu(layer(features))
        return features, self.projection(features)
class ControlNetHED_Apache2(nn.Module):
    """Five-stage HED-style network returning one single-channel projection per stage.

    The submodules are registered as ``block1`` .. ``block5`` and the learned
    input offset as ``norm``.
    """

    # (input_channel, output_channel, layer_number) for block1 .. block5.
    _STAGES = (
        (3, 64, 2),
        (64, 128, 2),
        (128, 256, 3),
        (256, 512, 3),
        (512, 512, 3),
    )

    def __init__(self):
        super().__init__()
        # Per-channel offset subtracted from the input before the first block.
        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
        for stage, (c_in, c_out, depth) in enumerate(self._STAGES, start=1):
            setattr(self, f'block{stage}',
                    DoubleConvBlock(input_channel=c_in, output_channel=c_out, layer_number=depth))

    def forward(self, x):
        """Return the tuple ``(projection1, ..., projection5)`` for input ``x``."""
        features = x - self.norm
        projections = []
        for stage in range(1, len(self._STAGES) + 1):
            # Every stage except the first halves the resolution before convolving.
            features, projection = getattr(self, f'block{stage}')(features, down_sampling=stage > 1)
            projections.append(projection)
        return tuple(projections)
class InternData(Dataset):
    """Dataset yielding ``(image_tensor, relative_path)`` pairs listed in a JSON index.

    Args:
        data_info_path: JSON file holding a list of records; each record must
            have a ``'path'`` key.
        image_root: Prefix prepended verbatim to every record's ``'path'``
            (include the trailing separator).

    Both arguments default to the locations that were previously hard-coded.
    """

    def __init__(self, data_info_path='data/InternData/partition/data_info.json',
                 image_root='data/InternImgs/'):
        with open(data_info_path, 'r') as f:
            self.j = json.load(f)
        self.image_root = image_root
        self.transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB')),
            T.Resize(image_resize),  # Image.BICUBIC
            T.CenterCrop(image_resize),
            T.ToTensor(),
        ])

    def __len__(self):
        return len(self.j)

    def getdata(self, idx):
        """Load and transform the image for record ``idx``; return ``(tensor, path)``."""
        path = self.j[idx]['path']
        # Close the file handle deterministically, even when decoding fails;
        # the transform output no longer references the opened file.
        with Image.open(self.image_root + path) as raw_image:
            image = self.transform(raw_image)
        return image, path

    def __getitem__(self, idx):
        """Return the sample at ``idx``, falling back to random samples on failure.

        Up to 20 attempts are made; each failure is printed and a new random
        index is drawn. This is a deliberate best-effort policy for corrupt or
        missing files.

        Raises:
            RuntimeError: if all 20 attempts fail.
        """
        for _ in range(20):
            try:
                return self.getdata(idx)
            except Exception as e:
                print(f"Error details: {str(e)}")
                idx = np.random.randint(len(self))
        raise RuntimeError('Too many bad data.')
class HEDdetector(nn.Module):
    """Frozen HED edge detector, optionally followed by a frozen VAE encoder.

    Args:
        feature: When true, edge maps are encoded with a VAE and the posterior
            statistics are returned; when false, the raw edge map is returned.
        vae: Optional pre-built VAE to reuse. Only consulted when ``feature``
            is true; if omitted one is loaded from
            ``output/pretrained_models/sd-vae-ft-ema``.
    """

    def __init__(self, feature=True, vae=None):
        super().__init__()
        self.model = ControlNetHED_Apache2()
        # NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
        self.model.load_state_dict(torch.load('output/pretrained_models/ControlNetHED.pth', map_location='cpu'))
        self.model.eval()
        self.model.requires_grad_(False)
        if feature:
            if vae is None:
                self.vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema")
            else:
                self.vae = vae
            self.vae.eval()
            self.vae.requires_grad_(False)
        else:
            self.vae = None

    def forward(self, input_image):
        """Compute the edge map (or its VAE posterior statistics) for a 4-D batch.

        Args:
            input_image: Tensor unpacked as ``(B, C, H, W)``. It is multiplied by
                255 before entering the HED network, so it is presumably scaled
                to [0, 1] -- confirm with the caller.

        Returns:
            Without a VAE: a tensor holding the sigmoid of the mean of the five
            resized side outputs, one channel per image.
            With a VAE: a NumPy array concatenating the posterior mean and std
            along dim 1.
        """
        _, _, H, W = input_image.shape
        with torch.inference_mode():
            edges = self.model(input_image * 255.)
            # Bring every side output back to the input resolution, then stack them.
            edges = torch.cat([TF.resize(e, [H, W]) for e in edges], dim=1)
            edge = 1 / (1 + torch.exp(-torch.mean(edges, dim=1, keepdim=True)))
            edge.clip_(0, 1)
            # Explicit None check: module truthiness is not a reliable presence
            # test (a module defining __len__ could evaluate as false).
            if self.vae is not None:
                edge = TF.normalize(edge, [.5], [.5])
                edge = edge.repeat(1, 3, 1, 1)
                posterior = self.vae.encode(edge).latent_dist
                edge = torch.cat([posterior.mean, posterior.std], dim=1).cpu().numpy()
        return edge
def main():
    """Extract HED features for every image in InternData and cache them as .npz files.

    Existing output files are left untouched.
    """
    dataset = InternData()
    dataloader = DataLoader(dataset, batch_size=10, shuffle=False, num_workers=8, pin_memory=True)
    hed = HEDdetector()
    accelerator = Accelerator()
    hed, dataloader = accelerator.prepare(hed, dataloader)
    for img, path in tqdm(dataloader):
        out = hed(img.cuda())
        for p, o in zip(path, out):
            # NOTE(review): this writes under 'data/InternalData/' while the index
            # is read from 'data/InternData/' -- confirm the spelling is intended.
            save = f'data/InternalData/hed_feature_{image_resize}/' + p.replace('.png', '.npz')
            # np.savez_compressed appends '.npz' when the name lacks it, so the
            # skip check must look at the file that is actually written.
            written = save if save.endswith('.npz') else save + '.npz'
            if os.path.exists(written):
                continue
            os.makedirs(os.path.dirname(save), exist_ok=True)
            np.savez_compressed(save, o)


if __name__ == "__main__":
    main()
from diffusion.model.llava.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig
\ No newline at end of file
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple, Union
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
from transformers import AutoConfig, AutoModelForCausalLM, CLIPVisionModel, CLIPImageProcessor
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from diffusion.model.llava.mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel
# Special token strings for image content. The patch/start/end tokens are added
# to the tokenizer by LlavaMPTForCausalLM.initialize_vision_tokenizer;
# DEFAULT_IMAGE_TOKEN is not referenced in the visible part of this file.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
class LlavaMPTConfig(MPTConfig):
    """MPT configuration identified by the ``"llava_mpt"`` model type."""
    model_type = "llava_mpt"
class LlavaMPTModel(MPTModel):
    """MPT backbone whose token embeddings are spliced with projected vision features.

    Image placeholder positions in ``input_ids`` are replaced by the output of
    ``mm_projector`` applied to hidden states of a CLIP vision tower.
    """
    config_class = LlavaMPTConfig

    def __init__(self, config: MPTConfig, mm_vision_tower=None, mm_hidden_size=None):
        """Build the MPT model plus, if the config requests them, the vision modules.

        ``mm_vision_tower`` and ``mm_hidden_size`` are accepted but not read;
        the values come from ``config``.
        """
        super(LlavaMPTModel, self).__init__(config)
        if hasattr(config, "mm_vision_tower"):
            # HACK: for FSDP
            # Held in a plain list, so it is not registered as a submodule.
            self.vision_tower = [CLIPVisionModel.from_pretrained(config.mm_vision_tower)]
            # self.vision_tower = CLIPVisionModel.from_pretrained(config.mm_vision_tower)
        if hasattr(config, "use_mm_proj"):
            self.mm_projector = nn.Linear(config.mm_hidden_size, config.d_model)

    def initialize_vision_modules(self, vision_tower, mm_vision_select_layer,
                                  pretrain_mm_mlp_adapter=None, tune_mm_mlp_adapter=False):
        """Attach (or reuse) the vision tower and projector and record them in the config.

        Args:
            vision_tower: Name or path passed to ``from_pretrained`` for the CLIP
                image processor and, if no tower is attached yet, the CLIP model.
            mm_vision_select_layer: Index of the vision hidden state to use.
            pretrain_mm_mlp_adapter: Optional checkpoint path; entries whose key
                contains ``'mm_projector'`` are loaded into the projector.
            tune_mm_mlp_adapter: Accepted but not read by this method.

        Returns:
            dict with ``image_processor``, ``image_token_len`` (number of
            patches per image) and ``vision_config``.
        """
        self.config.mm_vision_tower = vision_tower
        image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
        if not hasattr(self, 'vision_tower'):
            vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
        else:
            vision_tower = self.vision_tower[0]
        # The tower is frozen and kept in half precision.
        vision_tower.requires_grad_(False)
        vision_tower = vision_tower.to(torch.float16)
        self.vision_tower = [vision_tower]
        vision_config = vision_tower.config
        num_patches = (vision_config.image_size // vision_config.patch_size) ** 2
        self.config.use_mm_proj = True
        self.config.mm_hidden_size = vision_config.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        if not hasattr(self, 'mm_projector'):
            self.mm_projector = nn.Linear(vision_config.hidden_size, self.config.d_model)
        if pretrain_mm_mlp_adapter is not None:
            # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            self.mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items() if 'mm_projector' in k})
        return dict(
            image_processor=image_processor,
            image_token_len=num_patches,
            vision_config=vision_config
        )

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
        """Embed ``input_ids``, splice in image features, then run the MPT forward.

        Splicing happens only when a vision tower is attached, ``images`` is
        given, and either more than one token is fed or the model is training.

        Args:
            images: A batched tensor, or a list of per-image tensors (each is
                passed through the tower with a batch dimension added).

        Raises:
            ValueError: if image start/end tokens are unbalanced or misplaced,
                or if the patch tokens do not match the number of image patches
                or are not consecutive.
        """
        # HACK: replace back original embeddings for LLaVA pretraining
        orig_embeds_params = getattr(self, 'orig_embeds_params', None)
        # if orig_embeds_params is not None:
        #     orig_embeds_params = orig_embeds_params[0]
        #     with torch.no_grad():
        #         self.get_input_embeddings().weight.data[:-2] = orig_embeds_params[:-2].data
        inputs_embeds = self.wte(input_ids)
        vision_tower = getattr(self, 'vision_tower', None)
        if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
            # TODO: this is a modified multimodal LLM -- Haotian Liu
            vision_tower = vision_tower[0]  # HACK: for FSDP
            images_are_list = isinstance(images, list)
            select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
            # The vision tower is frozen: no gradients are tracked through it.
            with torch.no_grad():
                if images_are_list:
                    # variable length images
                    image_features = []
                    for image in images:
                        image_forward_out = vision_tower(image.unsqueeze(0), output_hidden_states=True)
                        select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
                        # Drop the first token of the sequence (presumably CLS -- confirm).
                        image_features.append(select_hidden_state[:, 1:])
                else:
                    image_forward_outs = vision_tower(images, output_hidden_states=True)
                    select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
                    image_features = select_hidden_state[:, 1:]
            # The projector runs outside no_grad, so it can receive gradients.
            if images_are_list:
                image_features = [self.mm_projector(image_feature)[0] for image_feature in image_features]
            else:
                image_features = self.mm_projector(image_features)
            # Width follows the projector's input size rather than a hard-coded
            # 1024, so vision towers with another hidden size work as well.
            dummy_image_features = torch.zeros(256, self.mm_projector.in_features, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            dummy_image_features = self.mm_projector(dummy_image_features)
            new_input_embeds = []
            cur_image_idx = 0
            for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
                if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
                    # multimodal LLM, but the current sample is not multimodal
                    # Adds an exact zero that still depends on mm_projector's output.
                    # NOTE(review): cur_image_idx is not advanced here, so `images`
                    # must hold entries only for samples containing patch tokens -- confirm.
                    cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
                    new_input_embeds.append(cur_input_embeds)
                    continue
                cur_image_features = image_features[cur_image_idx]
                num_patches = cur_image_features.shape[0]
                if vision_tower.config.use_im_start_end:
                    if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
                        raise ValueError("The number of image start tokens and image end tokens should be the same.")
                    image_start_tokens = torch.where(cur_input_ids == vision_tower.config.im_start_token)[0]
                    for image_start_token_pos in image_start_tokens:
                        cur_image_features = image_features[cur_image_idx].to(device=cur_input_embeds.device)
                        num_patches = cur_image_features.shape[0]
                        if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
                            raise ValueError("The image end token should follow the image start token.")
                        if orig_embeds_params is not None:
                            # Only the start/end token embeddings stay attached to the graph.
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
                        else:
                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
                        cur_image_idx += 1
                else:
                    if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
                        raise ValueError("The number of image patch tokens should be the same as the number of image patches.")
                    masked_indices = torch.where(cur_input_ids == vision_tower.config.im_patch_token)[0]
                    mask_index_start = masked_indices[0]
                    if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
                        raise ValueError("The image patch tokens should be consecutive.")
                    if orig_embeds_params is not None:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
                    else:
                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
                    # This sample consumed one image; move on so the next
                    # multimodal sample does not reuse the same features.
                    cur_image_idx += 1
                new_input_embeds.append(cur_new_input_embeds)
            inputs_embeds = torch.stack(new_input_embeds, dim=0)
        return super(LlavaMPTModel, self).forward(input_ids=None, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, tok_emb=inputs_embeds)
class LlavaMPTForCausalLM(MPTForCausalLM):
    """Causal language model head on top of :class:`LlavaMPTModel`.

    Logits are computed with the (tied) input embedding matrix and optionally
    multiplied by ``logit_scale``.
    """
    config_class = LlavaMPTConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        """Build the model.

        Raises:
            ValueError: if ``config.tie_word_embeddings`` is false or
                ``config.logit_scale`` is an unrecognized string.
        """
        # Skips MPTForCausalLM.__init__ (starts the MRO after it) so the
        # transformer can be created as a LlavaMPTModel below.
        super(MPTForCausalLM, self).__init__(config)
        if not config.tie_word_embeddings:
            raise ValueError('MPTForCausalLM only supports tied word embeddings')
        self.transformer = LlavaMPTModel(config)
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == 'inv_sqrt_d_model':
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
            self.logit_scale = logit_scale

    def get_model(self):
        """Return the underlying :class:`LlavaMPTModel`."""
        return self.transformer

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, LlavaMPTModel):
            module.gradient_checkpointing = value

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
        """Run the transformer and return logits (plus the loss when ``labels`` is given).

        Returns:
            CausalLMOutputWithPast with ``loss``, ``logits``, ``past_key_values``
            and ``hidden_states``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, images=images)
        # Tied embeddings: the output projection reuses the input embedding matrix.
        logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
            logits *= self.logit_scale
        loss = None
        if labels is not None:
            # Shift targets left by one so position t predicts token t + 1.
            # torch.roll returns a new tensor, so the caller's labels are untouched;
            # the wrapped-around last column is masked with -100.
            labels = torch.roll(labels, shifts=-1)
            labels[:, -1] = -100
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is given, if any sequence
                is right-padded, or if ``prefix_lm`` is combined with
                ``use_cache=False``.
        """
        if inputs_embeds is not None:
            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
        attention_mask = kwargs['attention_mask'].bool()
        # Every row must end on a real token, i.e. only left padding is allowed.
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError('MPT does not support generation with right padding.')
        if self.transformer.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None
        if past_key_values is not None:
            # With a cache only the newest token needs to be fed.
            input_ids = input_ids[:, -1].unsqueeze(-1)
        if self.transformer.prefix_lm:
            prefix_mask = torch.ones_like(attention_mask)
            if kwargs.get('use_cache') == False:
                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
        else:
            prefix_mask = None
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)}

    def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device,
                                    tune_mm_mlp_adapter=False, pretrain_mm_mlp_adapter=None):
        """Add the image tokens to ``tokenizer`` and record their ids on the vision config.

        Args:
            mm_use_im_start_end: Also add the image start/end tokens.
            tokenizer: Tokenizer to extend; the embedding table is resized to match.
            device: Device for the saved copy of the original embeddings.
            tune_mm_mlp_adapter: Keep a copy of the current input embeddings in
                ``orig_embeds_params`` and set the embedding ``requires_grad`` flags.
            pretrain_mm_mlp_adapter: Optional checkpoint path whose
                ``'transformer.wte.weight'`` provides the new-token embeddings.

        Raises:
            ValueError: if a pretrained adapter is given but the tokenizer did
                not gain exactly two new tokens, or the checkpoint's embedding
                shape is unexpected.
        """
        vision_config = self.get_model().vision_tower[0].config
        vision_config.use_im_start_end = mm_use_im_start_end
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        self.resize_token_embeddings(len(tokenizer))
        if mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))
            vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
            input_embeddings = None
            if num_new_tokens > 0:
                input_embeddings = self._init_new_token_embeddings(num_new_tokens)
            if tune_mm_mlp_adapter:
                self.get_model().orig_embeds_params = [self.get_input_embeddings().weight.data.clone().to(device=device)]
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
            if pretrain_mm_mlp_adapter:
                # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['transformer.wte.weight']
                # An explicit check survives `python -O` (unlike assert) and
                # guarantees input_embeddings was initialized above.
                if num_new_tokens != 2:
                    raise ValueError(f"Expected exactly 2 new tokens when loading a pretrained adapter, got {num_new_tokens}.")
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]

    def _init_new_token_embeddings(self, num_new_tokens):
        """Set the last ``num_new_tokens`` embedding rows to the mean of the earlier rows.

        Both means are computed before either table is written. Returns the
        input embedding weight data.
        """
        result = self.get_input_embeddings().weight.data
        output_embeddings = self.get_output_embeddings().weight.data
        input_embeddings_avg = result[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)
        result[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg
        return result

    # Backward-compatible alias for the previous auto-generated helper name.
    _extracted_from_initialize_vision_tokenizer_14 = _init_new_token_embeddings
# Register the config under the "llava_mpt" model type and pair it with the
# causal-LM class for the transformers Auto* factories.
AutoConfig.register("llava_mpt", LlavaMPTConfig)
AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM)
"""Attention layers."""
import math
import warnings
from typing import Optional
import torch
import torch.nn as nn
from einops import rearrange
from torch import nn
from .norm import LPLayerNorm
def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
if original_is_causal and num_query_tokens != num_key_tokens:
if num_query_tokens != 1:
raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
else:
return False
return original_is_causal
def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Plain PyTorch scaled dot-product attention over fused-head inputs.

    Args:
        query: ``(batch, s_q, n_heads * head_dim)``.
        key, value: ``(batch, s_k, kv_heads * head_dim)`` where ``kv_heads`` is
            1 when ``multiquery`` is true, else ``n_heads``.
        n_heads: Number of query heads.
        softmax_scale: Score multiplier; defaults to ``1 / sqrt(head_dim)``.
        attn_bias: Optional additive bias whose last two dims are 1 or
            ``s_q`` / ``s_k`` respectively.
        key_padding_mask: Optional ``(batch, s_k)`` bool mask, true for keys to keep.
        is_causal: Mask keys that lie after each query; queries are aligned
            to the end of the key sequence.
        dropout_p, training: Dropout applied to the attention probabilities.
        needs_weights: Also return the attention probabilities.
        multiquery: Share a single key/value head across all query heads.

    Returns:
        ``(out, attn_weight)`` with ``out`` of shape
        ``(batch, s_q, n_heads * head_dim)``; ``attn_weight`` is ``None``
        unless ``needs_weights`` is true.

    Raises:
        RuntimeError: if ``attn_bias`` cannot broadcast to the score shape.
    """
    kv_heads = 1 if multiquery else n_heads
    b, s_q = query.shape[:2]
    s_k = key.size(1)
    d = query.size(-1) // n_heads
    # Split the fused head dimension: q, v -> (b, h, s, d); k -> (b, h, d, s).
    q = query.reshape(b, s_q, n_heads, d).permute(0, 2, 1, 3)
    k = key.reshape(b, s_k, kv_heads, key.size(-1) // kv_heads).permute(0, 2, 3, 1)
    v = value.reshape(b, value.size(1), kv_heads, value.size(-1) // kv_heads).permute(0, 2, 1, 3)
    min_val = torch.finfo(q.dtype).min
    if softmax_scale is None:
        softmax_scale = 1 / math.sqrt(d)
    attn_weight = q.matmul(k) * softmax_scale
    if attn_bias is not None:
        if attn_bias.size(-1) not in (1, s_k) or attn_bias.size(-2) not in (1, s_q):
            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
        attn_weight = attn_weight + attn_bias
    if key_padding_mask is not None:
        if attn_bias is not None:
            warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
    if is_causal:
        # Build the boolean mask directly from position indices: query i may not
        # see key j when j > i. Both ranges are aligned to the end of the longer
        # sequence, matching the bottom-right slice of a full (s, s) mask.
        s = max(s_q, s_k)
        q_pos = torch.arange(s - s_q, s, device=attn_weight.device).view(s_q, 1)
        k_pos = torch.arange(s - s_k, s, device=attn_weight.device).view(1, s_k)
        causal_mask = k_pos > q_pos
        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
    attn_weight = torch.softmax(attn_weight, dim=-1)
    if dropout_p:
        # Not in-place: softmax's backward needs its own output intact, so an
        # in-place dropout would break autograd in training.
        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training)
    out = attn_weight.matmul(v)
    # (b, h, s, d) -> (b, s, h * d)
    out = out.permute(0, 2, 1, 3).reshape(b, s_q, n_heads * out.size(-1))
    return (out, attn_weight) if needs_weights else (out, None)
def check_valid_inputs(*tensors, valid_dtypes=None):
    """Validate that every tensor has an accepted dtype and lives on a CUDA device.

    Args:
        *tensors: Tensors to check, in order; the first violation raises.
        valid_dtypes: Accepted dtypes. Defaults to ``float16`` and ``bfloat16``.

    Raises:
        TypeError: on an unaccepted dtype or a non-CUDA tensor.
    """
    accepted = [torch.float16, torch.bfloat16] if valid_dtypes is None else valid_dtypes
    for candidate in tensors:
        dtype_ok = candidate.dtype in accepted
        if not dtype_ok:
            raise TypeError(f'tensor.dtype={candidate.dtype!r} must be in valid_dtypes={accepted!r}.')
        if candidate.is_cuda:
            continue
        raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={candidate.is_cuda!r}).')
def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Attention through flash-attn's unpadded (variable-length) kernel.

    Inputs are validated with ``check_valid_inputs``, padding is stripped using
    ``key_padding_mask`` and the result is re-padded to ``(batch, s_q, hidden)``.

    Returns:
        ``(output, None)`` -- attention weights are never returned.

    Raises:
        RuntimeError: if flash-attn cannot be imported.
        NotImplementedError: if ``attn_bias`` is given.
    """
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through; chaining keeps the real import failure visible.
        raise RuntimeError('Please install flash-attn==1.0.3.post0') from err
    check_valid_inputs(query, key, value)
    if attn_bias is not None:
        raise NotImplementedError('attn_bias not implemented for flash attn.')
    (batch_size, seqlen) = query.shape[:2]
    if key_padding_mask is None:
        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
    # Queries correspond to the trailing positions of the key sequence.
    query_padding_mask = key_padding_mask[:, -query.size(1):]
    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
    if multiquery:
        # Broadcast the single key/value head across all query heads.
        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    # NOTE(review): needs_weights is forwarded as return_attn_probs; if the kernel
    # then returns more than the output tensor, the rearrange below would fail -- confirm.
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
    return (output, None)
def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Attention through flash-attn's Triton kernel, which accepts an additive bias.

    A ``key_padding_mask`` is folded into ``attn_bias`` before the kernel call.

    Returns:
        ``(output, None)`` with ``output`` of shape ``(batch, s_q, hidden)``.

    Raises:
        RuntimeError: if the Triton flash-attn module cannot be imported.
        NotImplementedError: if dropout or attention weights are requested.
    """
    try:
        from flash_attn import flash_attn_triton
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through; chaining keeps the real import failure visible.
        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') from err
    check_valid_inputs(query, key, value)
    if dropout_p:
        raise NotImplementedError('Dropout not implemented for attn_impl: triton.')
    if needs_weights:
        raise NotImplementedError('attn_impl: triton cannot return attn weights.')
    if key_padding_mask is not None:
        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
        (b_size, s_k) = key_padding_mask.shape[:2]
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        # Masked-out keys receive the most negative representable bias.
        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
    if multiquery:
        # Broadcast the single key/value head across all query heads.
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    # Merge the head dimensions back: (b, s, h, d) -> (b, s, h * d).
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None)
class MultiheadAttention(nn.Module):
    """Multi-head self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        """Create the fused QKV projection, optional Q/K layer norms and output projection.

        Raises:
            ValueError: if ``attn_impl`` is not ``'flash'``, ``'triton'`` or ``'torch'``.
        """
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        # Default scale is 1 / sqrt(head_dim).
        if softmax_scale is None:
            softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.softmax_scale = softmax_scale
        self.attn_dropout_p = attn_pdrop

        # One linear layer yields query, key and value; `_fused` records the
        # split points along the output dimension.
        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
        self.Wqkv._fused = (0, (d_model, 2 * d_model))

        if self.qk_ln:
            norm_cls = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = norm_cls(self.d_model, device=device)
            self.k_ln = norm_cls(self.d_model, device=device)

        impl = self.attn_impl
        if impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn(
                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
                'it uses more memory. When training larger models this can trigger '
                'alloc retries which hurts performance. If encountered, we recommend '
                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
            )
        elif impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn(
                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
                    'we recommend using `attn_impl: triton`.'
                )
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        """Return ``(projected_context, attn_weights, past_key_value)``.

        When ``past_key_value`` is not ``None`` the cached key/value (if any)
        are prepended along the sequence dimension and the updated pair is
        returned as the new cache.
        """
        fused = self.Wqkv(x)
        if self.clip_qkv:
            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        query, key, value = fused.chunk(3, dim=2)

        if self.qk_ln:
            # Layer norm output is cast back to the projection's dtype.
            proj_dtype = query.dtype
            query = self.q_ln(query).to(proj_dtype)
            key = self.k_ln(key).to(proj_dtype)

        if past_key_value is not None:
            if len(past_key_value) != 0:
                cached_key, cached_value = past_key_value[0], past_key_value[1]
                key = torch.cat([cached_key, key], dim=1)
                value = torch.cat([cached_value, value], dim=1)
            past_key_value = (key, value)

        if attn_bias is not None:
            # Keep only the trailing rows/columns matching the current lengths.
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]

        context, attn_weights = self.attn_fn(
            query, key, value, self.n_heads,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=attention_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
        )
        return (self.out_proj(context), attn_weights, past_key_value)
class MultiQueryAttention(nn.Module):
    """Multi-Query self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
        """Create the fused projection (full-width query, single-head key/value).

        Raises:
            ValueError: if ``attn_impl`` is not ``'flash'``, ``'triton'`` or ``'torch'``.
        """
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # Default scale is 1 / sqrt(head_dim).
        if softmax_scale is None:
            softmax_scale = 1 / math.sqrt(self.head_dim)
        self.softmax_scale = softmax_scale
        self.attn_dropout_p = attn_pdrop

        # Output layout: d_model query features, then one head of keys and one
        # head of values; `_fused` records the split points.
        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        self.Wqkv._fused = (0, (d_model, d_model + self.head_dim))

        if self.qk_ln:
            norm_cls = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = norm_cls(d_model, device=device)
            self.k_ln = norm_cls(self.head_dim, device=device)

        impl = self.attn_impl
        if impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            warnings.warn(
                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
                'it uses more memory. When training larger models this can trigger '
                'alloc retries which hurts performance. If encountered, we recommend '
                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
            )
        elif impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available():
                warnings.warn(
                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
                    'we recommend using `attn_impl: triton`.'
                )
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        """Return ``(projected_context, attn_weights, past_key_value)``.

        When ``past_key_value`` is not ``None`` the cached key/value (if any)
        are prepended along the sequence dimension and the updated pair is
        returned as the new cache.
        """
        fused = self.Wqkv(x)
        if self.clip_qkv:
            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        split_sizes = [self.d_model, self.head_dim, self.head_dim]
        query, key, value = fused.split(split_sizes, dim=2)

        if self.qk_ln:
            # Layer norm output is cast back to the projection's dtype.
            proj_dtype = query.dtype
            query = self.q_ln(query).to(proj_dtype)
            key = self.k_ln(key).to(proj_dtype)

        if past_key_value is not None:
            if len(past_key_value) != 0:
                cached_key, cached_value = past_key_value[0], past_key_value[1]
                key = torch.cat([cached_key, key], dim=1)
                value = torch.cat([cached_value, value], dim=1)
            past_key_value = (key, value)

        if attn_bias is not None:
            # Keep only the trailing rows/columns matching the current lengths.
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]

        context, attn_weights = self.attn_fn(
            query, key, value, self.n_heads,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=attention_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
            multiquery=True,
        )
        return (self.out_proj(context), attn_weights, past_key_value)
def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    """Return the shape of the attention-bias buffer, or ``None`` if none is needed.

    ``'flash'`` never uses a bias. For ``'torch'`` / ``'triton'``:
    with alibi the bias is per-head, with a full ``seq_len`` row dimension when
    the attention is prefix-LM, non-causal or uses sequence ids, and a single
    row otherwise; without alibi a ``(1, 1, seq_len, seq_len)`` bias is needed
    only for prefix-LM or sequence ids.

    Raises:
        ValueError: for an unknown ``attn_impl``.
    """
    if attn_impl == 'flash':
        return None
    if attn_impl not in ('torch', 'triton'):
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

    needs_full_rows = prefix_lm or use_sequence_id
    if alibi:
        rows = seq_len if (needs_full_rows or not causal) else 1
        return (1, n_heads, rows, seq_len)
    if needs_full_rows:
        return (1, 1, seq_len, seq_len)
    return None
def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
    """Populate a pre-allocated attention bias, adding ALiBi when requested.

    Args:
        attn_impl: One of ``'flash'``, ``'torch'`` or ``'triton'``.
        attn_bias: Pre-allocated bias tensor; its device and dtype are used
            for the ALiBi term.
        n_heads: Number of attention heads.
        seq_len: Sequence length for the ALiBi term.
        causal: When ``False`` the full (two-sided) ALiBi bias is built.
        alibi: Whether to add the ALiBi bias.
        alibi_bias_max: Forwarded to ``build_alibi_bias``.

    Returns:
        ``None`` for ``'flash'``; otherwise ``attn_bias`` itself, or a new
        tensor with the ALiBi bias added.

    Raises:
        ValueError: If ``attn_impl`` is not a recognised implementation.
    """
    if attn_impl == 'flash':
        return None
    if attn_impl not in ['torch', 'triton']:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
    if not alibi:
        return attn_bias

    alibi_term = build_alibi_bias(
        n_heads,
        seq_len,
        full=not causal,
        alibi_bias_max=alibi_bias_max,
        device=attn_bias.device,
        dtype=attn_bias.dtype,
    )
    return attn_bias.add(alibi_term)
def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    """Compute per-head ALiBi slopes.

    Slopes are ``2 ** -(i * alibi_bias_max / p)`` for ``i = 1..p`` where ``p``
    is ``n_heads`` rounded up to a power of two. When ``n_heads`` is not a
    power of two, the odd-indexed slopes are placed before the even-indexed
    ones and the result is truncated to ``n_heads`` entries.

    Args:
        n_heads: Number of attention heads.
        alibi_bias_max: Largest exponent magnitude.
        device: Device on which to create the tensor.

    Returns:
        A float32 tensor of shape ``(1, n_heads, 1, 1)``.
    """
    padded_heads = 2 ** math.ceil(math.log2(n_heads))
    exponents = torch.arange(1, padded_heads + 1, dtype=torch.float32, device=device)
    exponents = exponents.mul(alibi_bias_max / padded_heads)
    slopes = 1.0 / torch.pow(2, exponents)
    if padded_heads != n_heads:
        odd_indexed = slopes[1::2]
        even_indexed = slopes[::2]
        slopes = torch.cat([odd_indexed, even_indexed])[:n_heads]
    return slopes.view(1, n_heads, 1, 1)
def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
    """Build the ALiBi attention bias.

    Args:
        n_heads: Number of attention heads.
        seq_len: Sequence length.
        full: When ``True`` build a ``(seq_len x seq_len)`` bias of negated
            absolute position differences; otherwise a single row of
            non-positive offsets ``1 - seq_len .. 0``.
        alibi_bias_max: Forwarded to ``gen_slopes``.
        device: Device on which to create the tensors.
        dtype: Dtype of the returned tensor.

    Returns:
        Tensor of shape ``(1, n_heads, 1, seq_len)``, or
        ``(1, n_heads, seq_len, seq_len)`` when ``full`` is set.
    """
    positions = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device)
    offsets = positions.view(1, 1, 1, seq_len)
    if full:
        # Pairwise (key - query) distance, made non-positive.
        pairwise = offsets - positions.view(1, 1, seq_len, 1)
        offsets = pairwise.abs().mul(-1)
    head_slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    scaled = offsets * head_slopes
    return scaled.to(dtype=dtype)
# Maps the `attn_type` config string to the attention module class to build.
ATTN_CLASS_REGISTRY = {
    'multihead_attention': MultiheadAttention,
    'multiquery_attention': MultiQueryAttention,
}
\ No newline at end of file
"""GPT Blocks used for the GPT Model."""
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
from .attention import ATTN_CLASS_REGISTRY
from .norm import NORM_CLASS_REGISTRY
class MPTMLP(nn.Module):
    """Feed-forward sub-layer: up-projection, exact GELU, down-projection."""

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        """Create the two linear layers.

        Args:
            d_model: Input and output feature size.
            expansion_ratio: Hidden size is ``expansion_ratio * d_model``.
            device: Device passed to the ``nn.Linear`` constructors.
        """
        super().__init__()
        hidden_dim = expansion_ratio * d_model
        self.up_proj = nn.Linear(d_model, hidden_dim, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(hidden_dim, d_model, device=device)
        # Flag read by the parameter-init code to rescale residual branches.
        self.down_proj._is_residual = True

    def forward(self, x):
        """Apply ``down_proj(gelu(up_proj(x)))``."""
        hidden = self.up_proj(x)
        activated = self.act(hidden)
        return self.down_proj(activated)
class MPTBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual add."""

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict = None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
        """Build the norms, attention module, MLP and residual dropouts.

        Args:
            d_model: Model width.
            n_heads: Number of attention heads.
            expansion_ratio: MLP hidden-size multiplier.
            attn_config: Attention settings; a default dict is used when ``None``.
            resid_pdrop: Dropout probability on both residual branches.
            norm_type: Key into ``NORM_CLASS_REGISTRY`` (case-insensitive).
            device: Device passed to sub-module constructors.
            **kwargs: Accepted and discarded so a full config dict can be splatted in.
        """
        if attn_config is None:
            attn_config = {
                'attn_type': 'multihead_attention',
                'attn_pdrop': 0.0,
                'attn_impl': 'triton',
                'qk_ln': False,
                'clip_qkv': None,
                'softmax_scale': None,
                'prefix_lm': False,
                'attn_uses_sequence_id': False,
                'alibi': False,
                'alibi_bias_max': 8,
            }
        del kwargs
        super().__init__()

        norm_cls = NORM_CLASS_REGISTRY[norm_type.lower()]
        attn_cls = ATTN_CLASS_REGISTRY[attn_config['attn_type']]

        # Sub-modules are created in the same order as before so parameter
        # registration order is unchanged.
        self.norm_1 = norm_cls(d_model, device=device)
        self.attn = attn_cls(
            attn_impl=attn_config['attn_impl'],
            clip_qkv=attn_config['clip_qkv'],
            qk_ln=attn_config['qk_ln'],
            softmax_scale=attn_config['softmax_scale'],
            attn_pdrop=attn_config['attn_pdrop'],
            d_model=d_model,
            n_heads=n_heads,
            device=device,
        )
        self.norm_2 = norm_cls(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Apply the block and return ``(hidden_states, past_key_value)``.

        The attention weights returned by the attention module are discarded.
        """
        attn_out, _, past_key_value = self.attn(
            self.norm_1(x),
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        x = x + self.resid_attn_dropout(attn_out)

        ffn_out = self.ffn(self.norm_2(x))
        x = x + self.resid_ffn_dropout(ffn_out)
        return (x, past_key_value)
\ No newline at end of file
"""A HuggingFace-style model configuration."""
from typing import Dict, Optional, Union
from transformers import PretrainedConfig
# Default attention settings; missing keys of a user `attn_config` are filled from here.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8,
}
# Default parameter-initialisation settings; missing keys of a user `init_config` are filled from here.
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
}
class MPTConfig(PretrainedConfig):
    """HuggingFace-style configuration object for MPT models."""

    model_type = 'mpt'

    def __init__(
        self,
        d_model: int=2048,
        n_heads: int=16,
        n_layers: int=24,
        expansion_ratio: int=4,
        max_seq_len: int=2048,
        vocab_size: int=50368,
        resid_pdrop: float=0.0,
        emb_pdrop: float=0.0,
        learned_pos_emb: bool=True,
        attn_config: Dict=attn_config_defaults,
        init_device: str='cpu',
        logit_scale: Optional[Union[float, str]]=None,
        no_bias: bool=False,
        verbose: int=0,
        embedding_fraction: float=1.0,
        norm_type: str='low_precision_layernorm',
        use_cache: bool=False,
        init_config: Dict=init_config_defaults,
        **kwargs,
    ):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict): A dictionary used to configure the model's attention module.
                The dictionary is copied, so neither the caller's dict nor the module-level
                defaults are modified. ``None`` is treated as "use all defaults".
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                    this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization (copied, like ``attn_config``):
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
            **kwargs: Forwarded to ``PretrainedConfig`` after dropping ``name`` and ``loss_fn``.

        Raises:
            ValueError: If a setting is out of range or inconsistent (see ``_validate_config``).
            NotImplementedError: If an option is requested with an attention implementation that lacks it.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        # Copy: the default argument is the shared module-level dict, and both
        # `_set_config_defaults` and the model constructor write into it.
        self.attn_config = dict(attn_config) if attn_config is not None else {}
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = dict(init_config) if init_config is not None else {}
        # These keys are not forwarded to PretrainedConfig.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(self, config, config_defaults):
        """Fill keys missing from ``config`` with ``config_defaults`` (in place) and return it."""
        for key, default in config_defaults.items():
            config.setdefault(key, default)
        return config

    def _validate_config(self):
        """Fill in default sub-config keys and check the settings for consistency.

        Raises:
            ValueError: For invalid numeric ranges, unknown ``attn_impl`` or
                ``logit_scale`` strings, a missing init ``name``, or when no
                positional information is configured.
            NotImplementedError: When ``prefix_lm``, ``alibi`` or
                ``attn_uses_sequence_id`` is requested with an attention
                implementation other than torch/triton.
        """
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            raise ValueError(
                'Positional information must be provided to the model using either learned_pos_emb or alibi.'
            )
\ No newline at end of file
"""A simple, flexible implementation of a GPT model.
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from .attention import attn_bias_shape, build_attn_bias
from .blocks import MPTBlock
from .norm import NORM_CLASS_REGISTRY
from .configuration_mpt import MPTConfig
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
from transformers.utils import logging
logger = logging.get_logger(__name__)
class MPTPreTrainedModel(PreTrainedModel):
    """Common base for the MPT model classes; binds them to ``MPTConfig``."""

    # Configuration class associated with these models.
    config_class = MPTConfig
    # Attribute prefix under which the base model is looked up.
    base_model_prefix = 'model'
class MPTModel(MPTPreTrainedModel):
    """MPT transformer body: token/position embeddings, blocks and final norm."""

    def __init__(self, config: MPTConfig):
        """Build the model described by ``config``.

        Raises:
            NotImplementedError: If ``config.norm_type`` is not a key of
                ``NORM_CLASS_REGISTRY``.
        """
        config._validate_config()
        super().__init__(config)
        self.attn_impl = config.attn_config['attn_impl']
        self.prefix_lm = config.attn_config['prefix_lm']
        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
        self.alibi = config.attn_config['alibi']
        self.alibi_bias_max = config.attn_config['alibi_bias_max']
        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
            raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
        self.embedding_fraction = config.embedding_fraction
        self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)
        if not self.alibi:
            # Learned position embeddings are only needed without ALiBi.
            self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
        self.emb_drop = nn.Dropout(config.emb_pdrop)
        self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
        self.norm_f = norm_class(config.d_model, device=config.init_device)
        if config.init_device != 'meta':
            self.apply(self.param_init_fn)
        self.is_causal = not self.prefix_lm
        # The attention bias is built lazily on first forward (see `_attn_bias`).
        self._attn_bias_initialized = False
        self.attn_bias = None
        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
        if config.no_bias:
            for module in self.modules():
                if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
                    if config.verbose:
                        warnings.warn(f'Removing bias ({module.bias}) from {module}.')
                    module.register_parameter('bias', None)
        if config.verbose and config.verbose > 2:
            print(self)
        if 'verbose' not in self.config.init_config:
            self.config.init_config['verbose'] = self.config.verbose
        if self.config.init_config['verbose'] > 1:
            init_fn_name = self.config.init_config['name']
            warnings.warn(f'Using {init_fn_name} initialization.')
        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.wte

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.wte = value

    @torch.no_grad()
    def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
        """Return ``(attn_bias, attention_mask)`` to hand to the blocks.

        For ``'flash'`` the cached bias and the untouched ``attention_mask``
        are returned. Otherwise the prefix, sequence-id and padding masks are
        folded into the bias and the returned mask is ``None``.

        Raises:
            ValueError: If ``attention_mask`` and ``prefix_mask`` shapes differ.
        """
        if not self._attn_bias_initialized:
            if self.attn_bias_shape:
                self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
                self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
            self._attn_bias_initialized = True
        if self.attn_impl == 'flash':
            return (self.attn_bias, attention_mask)
        if self.attn_bias is not None:
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
        attn_bias = self.attn_bias
        if self.prefix_lm:
            assert isinstance(attn_bias, torch.Tensor)
            assert isinstance(prefix_mask, torch.Tensor)
            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
        if self.attn_uses_sequence_id and sequence_id is not None:
            assert isinstance(attn_bias, torch.Tensor)
            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
        if attention_mask is not None:
            s_k = attention_mask.shape[-1]
            if attn_bias is None:
                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
            else:
                attn_bias = attn_bias[:, :, :, -s_k:]
            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
                raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
            # Padded key positions get the most negative representable value.
            min_val = torch.finfo(attn_bias.dtype).min
            attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
        return (attn_bias, None)

    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
        """Mask positions that are neither causal nor inside the prefix.

        Raises:
            ValueError: If ``attn_bias`` is not ``max_seq_len`` square in its
                last two dims, or ``prefix_mask`` is longer than ``max_seq_len``.
        """
        (s_k, s_q) = attn_bias.shape[-2:]
        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
            # Report the value actually compared against (max_seq_len).
            raise ValueError(
                f'attn_bias does not match the expected shape. The last two dimensions should both be {self.config.max_seq_len} '
                + f'but are {s_k} and {s_q}.'
            )
        seq_len = prefix_mask.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
        prefix = prefix_mask.view(-1, 1, 1, seq_len)
        cannot_attend = ~torch.logical_or(causal, prefix.bool())
        return self._extracted_from__apply_sequence_id_15(attn_bias, cannot_attend)

    def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
        """Mask attention between positions whose ``sequence_id`` differs.

        Raises:
            ValueError: If ``sequence_id`` is longer than ``max_seq_len``.
        """
        seq_len = sequence_id.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
        return self._extracted_from__apply_sequence_id_15(attn_bias, cannot_attend)

    # TODO Rename this here and in `_apply_prefix_mask` and `_apply_sequence_id`
    def _extracted_from__apply_sequence_id_15(self, attn_bias, cannot_attend):
        """Fill ``attn_bias`` with its dtype's minimum wherever ``cannot_attend`` is set."""
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
        return attn_bias

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, tok_emb: Optional[torch.FloatTensor]=None):
        """Run the transformer.

        Args:
            input_ids: Token ids; may be ``None`` when ``tok_emb`` is given.
            past_key_values: Per-layer key/value cache, or ``None``.
            attention_mask: Padding mask (converted to bool).
            prefix_mask: Prefix mask; required when ``prefix_lm`` is enabled.
            sequence_id: Sequence ids; required in train mode when
                ``attn_uses_sequence_id`` is enabled.
            return_dict: Must resolve to true; tuple output is not implemented.
            output_attentions: Must be falsy; not implemented.
            output_hidden_states: When true, collect the input to every block.
            use_cache: Whether to return the updated key/value cache.
            tok_emb: Pre-computed token embeddings, used only when
                ``input_ids`` is ``None``.

        Returns:
            ``BaseModelOutputWithPast`` with the normalised last hidden state,
            the cache and the optional hidden states.

        Raises:
            NotImplementedError: For ``return_dict=False``,
                ``output_attentions`` or training with left padding.
            ValueError: For missing ``prefix_mask``/``sequence_id``, a cache
                whose length differs from ``n_layers``, or a total sequence
                length above ``max_seq_len``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        if prefix_mask is not None:
            prefix_mask = prefix_mask.bool()
        if not return_dict:
            raise NotImplementedError('return_dict False is not implemented yet for MPT')
        if output_attentions:
            raise NotImplementedError('output_attentions is not implemented yet for MPT')
        if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
            raise NotImplementedError('MPT does not support training with left padding.')
        if self.prefix_lm and prefix_mask is None:
            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
        if self.training:
            if self.attn_uses_sequence_id and sequence_id is None:
                raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
            elif self.attn_uses_sequence_id is False and sequence_id is not None:
                warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
        if input_ids is not None:
            S = input_ids.size(1)
            assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
            tok_emb = self.wte(input_ids)
        else:
            assert tok_emb is not None
            S = tok_emb.size(1)
        if self.alibi:
            x = tok_emb
        else:
            past_position = 0
            if past_key_values is not None:
                if len(past_key_values) != self.config.n_layers:
                    raise ValueError(
                        f'past_key_values must provide a past_key_value for each attention layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).'
                    )
                past_position = past_key_values[0][0].size(1)
            if S + past_position > self.config.max_seq_len:
                raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S}, this model only supports total sequence length <= {self.config.max_seq_len}.')
            # Use tok_emb's device: input_ids is None when the caller supplied
            # pre-computed embeddings.
            pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=tok_emb.device).unsqueeze(0)
            if attention_mask is not None:
                # Shift positions left by the number of padded tokens seen so far.
                pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
            pos_emb = self.wpe(pos)
            x = tok_emb + pos_emb
        if self.embedding_fraction == 1:
            x = self.emb_drop(x)
        else:
            # Same forward value as x, but only `embedding_fraction` of the
            # gradient flows back into the embeddings.
            x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
            assert isinstance(self.emb_drop, nn.Module)
            x = self.emb_drop(x_shrunk)
        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
        if use_cache and past_key_values is None:
            past_key_values = [() for _ in range(self.config.n_layers)]
        all_hidden_states = () if output_hidden_states else None
        for (b_idx, block) in enumerate(self.blocks):
            if output_hidden_states:
                assert all_hidden_states is not None
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = past_key_values[b_idx] if past_key_values is not None else None
            if self.gradient_checkpointing and self.training:
                (x, past_key_value) = torch.utils.checkpoint.checkpoint(
                    block,
                    x, past_key_value, attn_bias, attention_mask, self.is_causal
                )
            else:
                (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value
        x = self.norm_f(x)
        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)

    def param_init_fn(self, module):
        """Initialise ``module`` with the scheme named in ``config.init_config``."""
        init_fn_name = self.config.init_config['name']
        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)

    def fsdp_wrap_fn(self, module):
        """Return whether ``module`` is an ``MPTBlock``."""
        return isinstance(module, MPTBlock)

    def activation_checkpointing_fn(self, module):
        """Return whether ``module`` is an ``MPTBlock``."""
        return isinstance(module, MPTBlock)
class MPTForCausalLM(MPTPreTrainedModel):
    """MPT transformer with a language-modelling head tied to the input embeddings."""

    def __init__(self, config: MPTConfig):
        """Build the transformer and resolve the optional logit scale.

        Raises:
            ValueError: If word embeddings are not tied, or ``logit_scale``
                is an unrecognised string.
        """
        super().__init__(config)
        if not config.tie_word_embeddings:
            raise ValueError('MPTForCausalLM only supports tied word embeddings')
        self.transformer = MPTModel(config)
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == 'inv_sqrt_d_model':
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
            self.logit_scale = logit_scale

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.transformer.wte

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.transformer.wte = value

    def get_output_embeddings(self):
        """Return the output embedding module (the tied token embedding)."""
        return self.transformer.wte

    def set_output_embeddings(self, new_embeddings):
        """Replace the output embedding module (the tied token embedding)."""
        self.transformer.wte = new_embeddings

    def set_decoder(self, decoder):
        """Replace the underlying transformer."""
        self.transformer = decoder

    def get_decoder(self):
        """Return the underlying transformer."""
        return self.transformer

    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
        """Compute logits and, when ``labels`` is given, the next-token loss.

        Logits are the last hidden state projected with the token embedding
        weights, optionally multiplied by ``logit_scale``. Labels are shifted
        left by one position inside this method, so callers pass them
        unshifted.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss`` (or ``None``), ``logits``,
            the key/value cache and the optional hidden states.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
        logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
            logits *= self.logit_scale
        loss = None
        if labels is not None:
            # Shift targets left along the sequence dimension; the wrapped-around
            # last position is ignored via the -100 label. `torch.roll` returns
            # a new tensor, so the caller's labels are not modified.
            labels = torch.roll(labels, shifts=-1, dims=1)
            labels[:, -1] = -100
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)

    def param_init_fn(self, module):
        """Initialise ``module`` with the scheme named in ``config.init_config``."""
        init_fn_name = self.config.init_config['name']
        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)

    def fsdp_wrap_fn(self, module):
        """Return whether ``module`` is an ``MPTBlock``."""
        return isinstance(module, MPTBlock)

    def activation_checkpointing_fn(self, module):
        """Return whether ``module`` is an ``MPTBlock``."""
        return isinstance(module, MPTBlock)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        Requires ``attention_mask`` in ``kwargs``. When a cache is supplied
        only the last token of ``input_ids`` is kept.

        Raises:
            NotImplementedError: For ``inputs_embeds``, right-padded input,
                or ``prefix_lm`` combined with ``use_cache=False``.
        """
        if inputs_embeds is not None:
            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
        attention_mask = kwargs['attention_mask'].bool()
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError('MPT does not support generation with right padding.')
        if self.transformer.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None
        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)
        if self.transformer.prefix_lm:
            prefix_mask = torch.ones_like(attention_mask)
            # Only an explicit False is rejected; a missing or None value is allowed.
            if kwargs.get('use_cache') is False:
                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
        else:
            prefix_mask = None
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)}

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Used by HuggingFace generate when using beam search with kv-caching.

        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
        for an example in transformers.
        """
        return [
            tuple(
                (past_state.index_select(0, beam_idx) for past_state in layer_past)
            )
            for layer_past in past_key_values
        ]
\ No newline at end of file
import torch
def _cast_if_autocast_enabled(tensor):
if torch.is_autocast_enabled():
if tensor.device.type == 'cuda':
dtype = torch.get_autocast_gpu_dtype()
elif tensor.device.type == 'cpu':
dtype = torch.get_autocast_cpu_dtype()
else:
raise NotImplementedError()
return tensor.to(dtype=dtype)
return tensor
class LPLayerNorm(torch.nn.LayerNorm):
    """LayerNorm that runs in the autocast dtype when autocast is enabled."""

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        """Forward all arguments unchanged to ``torch.nn.LayerNorm``."""
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=elementwise_affine,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        """Cast input, weight and bias to the autocast dtype, then normalise with autocast off."""
        device_type = x.device.type
        x_cast = _cast_if_autocast_enabled(x)

        weight_cast = self.weight
        if weight_cast is not None:
            weight_cast = _cast_if_autocast_enabled(weight_cast)

        bias_cast = self.bias
        if bias_cast is not None:
            bias_cast = _cast_if_autocast_enabled(bias_cast)

        # Autocast is disabled so layer_norm runs in the dtype chosen above.
        with torch.autocast(enabled=False, device_type=device_type):
            return torch.nn.functional.layer_norm(x_cast, self.normalized_shape, weight_cast, bias_cast, self.eps)
def rms_norm(x, weight=None, eps=1e-05):
    """Apply root-mean-square normalisation over the last dimension.

    Computes ``x / sqrt(mean(x ** 2, dim=-1) + eps)`` and optionally scales
    the result by ``weight``.

    Args:
        x: Input tensor.
        weight: Optional scale, broadcast against the normalised output.
        eps: Added to the mean square before the reciprocal square root.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # rsqrt already yields 1 / RMS, so the input is multiplied by it;
    # dividing by it would scale by the RMS instead of normalising.
    inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    output = x * inv_rms
    return output * weight if weight is not None else output
class RMSNorm(torch.nn.Module):
    """RMS normalisation layer with an optional learnable scale."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        """Create the layer.

        Args:
            normalized_shape: Shape of the scale parameter.
            eps: Stability constant passed to ``rms_norm``.
            weight: When truthy, create a scale parameter initialised to ones;
                otherwise register ``weight`` as ``None``.
            dtype: Dtype of the scale parameter.
            device: Device of the scale parameter.
        """
        super().__init__()
        self.eps = eps
        if not weight:
            self.register_parameter('weight', None)
            return
        scale = torch.ones(normalized_shape, dtype=dtype, device=device)
        self.weight = torch.nn.Parameter(scale)

    def forward(self, x):
        """Normalise in float32 and cast the result back to ``x``'s dtype."""
        normed = rms_norm(x.float(), self.weight, self.eps)
        return normed.to(dtype=x.dtype)
class LPRMSNorm(RMSNorm):
    """RMSNorm that runs in the autocast dtype when autocast is enabled."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        """Forward all arguments unchanged to ``RMSNorm``."""
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            weight=weight,
            dtype=dtype,
            device=device,
        )

    def forward(self, x):
        """Cast input and weight to the autocast dtype, normalise with autocast off, restore ``x``'s dtype."""
        x_cast = _cast_if_autocast_enabled(x)

        weight_cast = self.weight
        if weight_cast is not None:
            weight_cast = _cast_if_autocast_enabled(weight_cast)

        with torch.autocast(enabled=False, device_type=x.device.type):
            normed = rms_norm(x_cast, weight_cast, self.eps)
            return normed.to(dtype=x.dtype)
# Maps the lower-cased `norm_type` config string to the normalisation class to build.
NORM_CLASS_REGISTRY = {
    'layernorm': torch.nn.LayerNorm,
    'low_precision_layernorm': LPLayerNorm,
    'rmsnorm': RMSNorm,
    'low_precision_rmsnorm': LPRMSNorm,
}
\ No newline at end of file
import math
import warnings
from collections.abc import Sequence
from functools import partial
from typing import Optional, Tuple, Union
import torch
from torch import nn
from .norm import NORM_CLASS_REGISTRY
def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
    """Re-initialise ``module`` through its own ``reset_parameters`` method, if it has one.

    Args:
        module: Module to initialise.
        verbose: A warning describing the scheme is emitted when greater than 1.
        **kwargs: Accepted and discarded so every init function shares one call signature.
    """
    del kwargs
    if verbose > 1:
        warnings.warn("Initializing network using module's reset_parameters attribute")
    if not hasattr(module, 'reset_parameters'):
        return
    module.reset_parameters()
def fused_init_helper_(module: nn.Module, init_fn_):
    """Initialise each chunk of a fused weight matrix separately.

    ``module._fused`` must be a ``(dim, splits)`` pair: ``splits`` are the
    interior boundaries along ``dim`` at which ``module.weight`` is cut, and
    ``init_fn_`` is applied to every resulting slice in order.

    Args:
        module: Module carrying ``weight`` and ``_fused`` attributes.
        init_fn_: Callable applied to each weight slice.

    Raises:
        RuntimeError: If ``module`` has no ``_fused`` attribute (or it is ``None``).
    """
    fused_spec = getattr(module, '_fused', None)
    if fused_spec is None:
        raise RuntimeError('Internal logic error')
    (dim, splits) = fused_spec
    boundaries = (0, *splits, module.weight.size(dim))
    for (start, end) in zip(boundaries[:-1], boundaries[1:]):
        index = [slice(None)] * module.weight.ndim
        index[dim] = slice(start, end)
        # Index with a tuple: passing a list of slices relies on deprecated
        # non-tuple sequence indexing.
        init_fn_(module.weight[tuple(index)])
def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Initialize the parameters owned directly by ``module`` using ``init_fn_``.

    Dispatches on the module type:

    * ``nn.Linear``: weight via ``init_fn_`` (block-wise through
      ``fused_init_helper_`` when the module has a ``_fused`` attribute), bias
      zeroed, and the weight divided by the residual divisor when the module
      carries a truthy ``_is_residual`` attribute.
    * ``nn.Embedding``: normal init when ``emb_init_std`` is given, otherwise
      uniform init when ``emb_init_uniform_lim`` is given, otherwise ``init_fn_``.
    * any class in ``NORM_CLASS_REGISTRY``: weight set to 1 and bias to 0 when present.
    * ``nn.MultiheadAttention``: q/k/v and output projection weights via
      ``init_fn_``, biases zeroed, output projection divided when flagged residual.
    * anything else: an error is raised if the module directly owns parameters.

    Args:
        module: module to initialize (only its own parameters are handled).
        init_fn_: in-place initializer applied to weight tensors.
        n_layers: used for the default residual divisor ``sqrt(2 * n_layers)``.
        d_model: required for ``nn.MultiheadAttention`` with a fused in-projection.
        init_div_is_residual: ``False`` disables the division, ``True`` uses
            ``sqrt(2 * n_layers)``, a number (or a string accepted by
            ``str.isnumeric``) is used as the divisor directly.
        emb_init_std: standard deviation for normal embedding init.
        emb_init_uniform_lim: ``(min, max)`` pair, or a scalar ``lim`` meaning
            ``(-lim, lim)``, for uniform embedding init.
        verbose: warnings describing the scheme are emitted when ``> 1``.
        **kwargs: accepted and discarded.

    Raises:
        ValueError: for an unsupported ``init_div_is_residual`` value, or when
            ``emb_init_uniform_lim`` is a sequence that does not hold exactly
            two limits.
        NotImplementedError: when ``module`` is of an unhandled type and
            directly owns parameters.
    """
    del kwargs
    if verbose > 1:
        warnings.warn('If model has bias parameters they are initialized to 0.')
    # Resolve the divisor applied to layers flagged with ``_is_residual``.
    # The bool cases are checked by identity first because bool is an int.
    if init_div_is_residual is False:
        div_is_residual = 1.0
    elif init_div_is_residual is True:
        div_is_residual = math.sqrt(2 * n_layers)
    elif isinstance(init_div_is_residual, (float, int)):
        div_is_residual = init_div_is_residual
    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
        div_is_residual = float(init_div_is_residual)
    else:
        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
    if init_div_is_residual is not False and verbose > 1:
        warnings.warn(
            f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. Set `init_div_is_residual: false` in init config to disable this.'
        )
    if isinstance(module, nn.Linear):
        if hasattr(module, '_fused'):
            fused_init_helper_(module, init_fn_)
        else:
            init_fn_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
            with torch.no_grad():
                module.weight.div_(div_is_residual)
    elif isinstance(module, nn.Embedding):
        if emb_init_std is not None:
            std = emb_init_std
            if std == 0:
                warnings.warn('Embedding layer initialized to 0.')
            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
        elif emb_init_uniform_lim is not None:
            lim = emb_init_uniform_lim
            if isinstance(lim, Sequence):
                # Exactly two limits are required; shorter sequences used to
                # slip through this check and fail later with an IndexError.
                if len(lim) != 2:
                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
                if lim[0] == lim[1]:
                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
            else:
                if lim == 0:
                    warnings.warn('Embedding layer initialized to 0.')
                lim = [-lim, lim]
            (a, b) = lim
            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
        else:
            emb_init_fn_ = init_fn_
        emb_init_fn_(module.weight)
    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
        if verbose > 1:
            warnings.warn(
                'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.'
            )
        if hasattr(module, 'weight') and module.weight is not None:
            torch.nn.init.ones_(module.weight)
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.MultiheadAttention):
        if module._qkv_same_embed_dim:
            # Fused q/k/v projection: each d_model-row block is initialized on its own.
            _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_)
        else:
            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
            assert module.in_proj_weight is None
            init_fn_(module.q_proj_weight)
            init_fn_(module.k_proj_weight)
            init_fn_(module.v_proj_weight)
        if module.in_proj_bias is not None:
            torch.nn.init.zeros_(module.in_proj_bias)
        if module.bias_k is not None:
            torch.nn.init.zeros_(module.bias_k)
        if module.bias_v is not None:
            torch.nn.init.zeros_(module.bias_v)
        init_fn_(module.out_proj.weight)
        if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
            with torch.no_grad():
                module.out_proj.weight.div_(div_is_residual)
        if module.out_proj.bias is not None:
            torch.nn.init.zeros_(module.out_proj.bias)
    else:
        # Raise as soon as one directly-owned parameter is found; modules
        # without own parameters pass through silently.
        for _ in module.parameters(recurse=False):
            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
# TODO Rename this here and in `generic_param_init_fn_`
def _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_):
    """Initialize a fused q/k/v in-projection in three separate row blocks.

    ``module.in_proj_weight`` is sliced into the row ranges ``[0, d)``,
    ``[d, 2d)`` and ``[2d, 3d)`` with ``d = d_model`` and ``init_fn_`` is
    called on each slice in turn.
    """
    assert module.in_proj_weight is not None
    assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
    assert d_model is not None
    width = d_model
    for block_idx in range(3):
        lo = block_idx * width
        hi = lo + width
        init_fn_(module.in_proj_weight[lo:hi])
def _normal_init_(std, mean=0.0):
    """Return ``torch.nn.init.normal_`` with ``mean`` and ``std`` pre-bound."""
    bound_normal_ = partial(torch.nn.init.normal_, mean=mean, std=std)
    return bound_normal_
def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a zero-mean normal initializer.

    Args:
        module: module to initialize.
        std: standard deviation of the normal initializer.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: accepted and discarded.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
    normal_fn_ = _normal_init_(std=std)
    generic_param_init_fn_(
        module=module,
        init_fn_=normal_fn_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Normal initialization with a caller-supplied standard deviation.

    Args:
        module: module to initialize.
        init_std: standard deviation of the normal initializer; must not be ``None``.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: forwarded unchanged to
            ``_normal_param_init_fn_``.
        **kwargs: accepted and discarded.

    Raises:
        ValueError: if ``init_std`` is ``None``.
    """
    del kwargs
    if init_std is not None:
        _normal_param_init_fn_(
            module=module,
            std=init_std,
            d_model=d_model,
            n_layers=n_layers,
            init_div_is_residual=init_div_is_residual,
            emb_init_std=emb_init_std,
            emb_init_uniform_lim=emb_init_uniform_lim,
            verbose=verbose,
        )
        return
    raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Normal initialization with ``std = sqrt(2 / (5 * d_model))``.

    All remaining arguments are forwarded unchanged to
    ``_normal_param_init_fn_``; ``**kwargs`` is accepted and discarded.
    """
    del kwargs
    small_std = math.sqrt(2 / (5 * d_model))
    _normal_param_init_fn_(
        module=module,
        std=small_std,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Small-init scheme with a residual divisor of ``n_layers / sqrt(10)``.

    From section 2.3.1 of "GPT-NeoX-20B: An Open-Source Autoregressive
    Language Model", Black et al. (2022).
    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py

    The computed divisor is handed to ``small_param_init_fn_`` as
    ``init_div_is_residual``; ``**kwargs`` is accepted and discarded.
    """
    del kwargs
    residual_div = n_layers / math.sqrt(10)
    if verbose > 1:
        warnings.warn(f'setting init_div_is_residual to {residual_div}')
    small_param_init_fn_(
        module=module,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=residual_div,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a Kaiming-uniform initializer.

    Args:
        init_gain: passed to ``kaiming_uniform_`` as ``a``.
        fan_mode: passed to ``kaiming_uniform_`` as ``mode``.
        init_nonlinearity: passed to ``kaiming_uniform_`` as ``nonlinearity``.
        Remaining arguments are forwarded unchanged to
        ``generic_param_init_fn_``; ``**kwargs`` is accepted and discarded.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
        )
    bound_init_ = partial(
        torch.nn.init.kaiming_uniform_,
        a=init_gain,
        mode=fan_mode,
        nonlinearity=init_nonlinearity,
    )
    generic_param_init_fn_(
        module=module,
        init_fn_=bound_init_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a Kaiming-normal initializer.

    Args:
        init_gain: passed to ``kaiming_normal_`` as ``a``.
        fan_mode: passed to ``kaiming_normal_`` as ``mode``.
        init_nonlinearity: passed to ``kaiming_normal_`` as ``nonlinearity``.
        Remaining arguments are forwarded unchanged to
        ``generic_param_init_fn_``; ``**kwargs`` is accepted and discarded.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using nn.init.kaiming_normal_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
        )
    bound_init_ = partial(
        nn.init.kaiming_normal_,
        a=init_gain,
        mode=fan_mode,
        nonlinearity=init_nonlinearity,
    )
    generic_param_init_fn_(
        module=module,
        init_fn_=bound_init_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a Xavier-uniform initializer.

    Args:
        init_gain: passed to ``xavier_uniform_`` as ``gain`` (defaults to 0).
        Remaining arguments are forwarded unchanged to
        ``generic_param_init_fn_``; ``**kwargs`` is accepted and discarded.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}'
        )
    bound_init_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
    generic_param_init_fn_(
        module=module,
        init_fn_=bound_init_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a Xavier-normal initializer.

    Args:
        init_gain: passed to ``xavier_normal_`` as ``gain`` (defaults to 0).
        Remaining arguments are forwarded unchanged to
        ``generic_param_init_fn_``; ``**kwargs`` is accepted and unused.
    """
    # Extra keyword arguments are never used; drop them like the sibling init functions do.
    del kwargs
    if verbose > 1:
        warnings.warn(
            f'Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}'
        )
    bound_init_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
    generic_param_init_fn_(
        module=module,
        init_fn_=bound_init_,
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
# Maps the init-scheme name used in configs to the function implementing it.
MODEL_INIT_REGISTRY = {
    'default_': torch_default_param_init_fn_,
    'baseline_': baseline_param_init_fn_,
    'kaiming_uniform_': kaiming_uniform_param_init_fn_,
    'kaiming_normal_': kaiming_normal_param_init_fn_,
    'neox_init_': neox_param_init_fn_,
    'small_init_': small_param_init_fn_,
    'xavier_uniform_': xavier_uniform_param_init_fn_,
    'xavier_normal_': xavier_normal_param_init_fn_,
}
\ No newline at end of file
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
import os
import numpy as np
from timm.models.layers import DropPath
from timm.models.vision_transformer import PatchEmbed, Mlp
from diffusion.model.builder import MODELS
from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, LabelEmbedder, FinalLayer
from diffusion.utils.logger import get_root_logger
class PixArtBlock(nn.Module):
    """
    A PixArt block with adaptive layer norm (adaLN-single) conditioning.

    Each block runs modulated self-attention, cross-attention against the
    condition tokens, and a modulated MLP, every stage with a residual
    connection. The six modulation tensors come from the block's learned
    ``scale_shift_table`` added to the incoming conditioning tensor ``t``.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.window_size = window_size

        def approx_gelu():
            return nn.GELU(approximate="tanh")

        # A non-zero window size replaces the attention input size.
        attn_input_size = (window_size, window_size) if window_size != 0 else input_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            input_size=attn_input_size,
            use_rel_pos=use_rel_pos,
            **block_kwargs,
        )
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, mask=None, **kwargs):
        """Apply the block to tokens ``x`` with condition ``y`` and conditioning ``t``.

        ``t`` is reshaped to ``(B, 6, -1)`` and split into shift/scale/gate
        triples for the self-attention and MLP branches. ``mask`` is handed
        to the cross-attention layer; ``**kwargs`` is unused.
        """
        B, N, C = x.shape
        modulation = self.scale_shift_table[None] + t.reshape(B, 6, -1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        attn_in = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        attn_out = self.attn(attn_in).reshape(B, N, C)
        x = x + self.drop_path(gate_msa * attn_out)

        x = x + self.cross_attn(x, y, mask)

        mlp_in = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        x = x + self.drop_path(gate_mlp * self.mlp(mlp_in))
        return x
#############################################################################
# Core PixArt Model #
#################################################################################
@MODELS.register_module()
class PixArt(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    The input is patch-embedded and offset by a fixed sin-cos positional
    embedding, conditioned on a timestep embedding and on caption tokens
    (through cross-attention in every block), and finally un-patchified back
    to a ``(N, out_channels, H, W)`` tensor. ``out_channels`` is
    ``2 * in_channels`` when ``pred_sigma`` is set, else ``in_channels``.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=120, **kwargs):
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__()
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if pred_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads
        # Stored as the plain value: a stray trailing comma used to wrap it in
        # a 1-tuple, which leaked into the log line and the position grid.
        self.lewei_scale = lewei_scale
        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        num_patches = self.x_embedder.num_patches
        self.base_size = input_size // self.patch_size
        # Will use fixed sin-cos embedding:
        self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        # Projects the timestep embedding to the six modulation vectors shared by all blocks.
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        drop_path = [rate.item() for rate in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                        input_size=(input_size // patch_size, input_size // patch_size),
                        window_size=window_size if i in window_block_indexes else 0,
                        use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()
        if config:
            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
            logger.warning(f"lewei scale: {self.lewei_scale}, base size: {self.base_size}")
        else:
            print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}')

    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, 120, C) tensor of caption embeddings
        mask: optional mask over caption tokens; tokens whose mask value is 0 are dropped
        data_info, **kwargs: accepted for interface compatibility, unused here
        """
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        pos_embed = self.pos_embed.to(self.dtype)
        # Remember the token grid of the latest input.
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Pack the kept caption tokens of all samples into one sequence;
            # y_lens records how many tokens each sample contributed.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, y_lens)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, y, mask=None, **kwargs):
        """
        Forward pass for DPM-Solver, which does not need the variance prediction.

        When ``pred_sigma`` is set only the first half of the output channels
        is returned; otherwise the output is returned unchanged (it has no
        second half to drop).
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, y, mask)
        return model_out.chunk(2, dim=1)[0] if self.pred_sigma else model_out

    def forward_with_cfg(self, x, timestep, y, cfg_scale, mask=None, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.

        The first half of ``x`` is duplicated, so ``y`` is expected to stack
        the conditional and the unconditional captions along the batch axis.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # Extra keyword arguments are expanded; passing the dict positionally
        # would bind it to ``data_info``.
        model_out = self.forward(combined, timestep, y, mask, **kwargs)
        model_out = model_out['x'] if isinstance(model_out, dict) else model_out
        # NOTE(review): guidance is applied to the first three channels only,
        # although in_channels defaults to 4. Kept as-is because changing it
        # alters sampling results -- confirm the intent.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        return: (N, C, h * p, w * p), where h = w = sqrt(T) (a square token grid is required)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, h * p, w * p))

    def initialize_weights(self):
        """Apply the model's weight-initialization scheme to all sub-modules."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)
        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
        # Zero-out the cross-attention output projection in every PixArt block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)
        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    @property
    def dtype(self):
        """dtype of the first parameter of the model."""
        return next(self.parameters()).dtype
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16):
    """
    Build a 2D sin-cos positional embedding for a grid of positions.

    grid_size: int (square grid) or a (height, width) pair.
    Positions along each axis are divided by ``grid_size / base_size`` and
    then by ``lewei_scale`` before being encoded.
    return:
    pos_embed: [grid_h*grid_w, embed_dim], with ``extra_tokens`` all-zero rows
    prepended when ``cls_token`` is set and ``extra_tokens > 0``.
    """
    if isinstance(grid_size, int):
        grid_size = to_2tuple(grid_size)
    rows, cols = grid_size[0], grid_size[1]
    coords_h = np.arange(rows, dtype=np.float32) / (rows / base_size) / lewei_scale
    coords_w = np.arange(cols, dtype=np.float32) / (cols / base_size) / lewei_scale
    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0)  # here w goes first
    mesh = mesh.reshape([2, 1, cols, rows])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if cls_token and extra_tokens > 0:
        zero_rows = np.zeros([extra_tokens, embed_dim])
        pos_embed = np.concatenate([zero_rows, pos_embed], axis=0)
    return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode the two coordinate planes of ``grid`` with half of ``embed_dim`` each.

    ``grid[0]`` and ``grid[1]`` are each encoded to ``embed_dim // 2`` features
    and concatenated along the feature axis (``grid[0]`` first).
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    per_plane = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[plane]) for plane in (0, 1)]
    return np.concatenate(per_plane, axis=1)  # (H*W, D)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sin-cos encoding of a set of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; flattened to size (M,)
    out: (M, D) -- the first D/2 columns are sines, the last D/2 cosines
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    exponents = np.arange(half_dim, dtype=np.float64)
    exponents /= embed_dim / 2.
    freqs = 1. / 10000 ** exponents  # (D/2,)
    flat_pos = pos.reshape(-1)  # (M,)
    angles = np.einsum('m,d->md', flat_pos, freqs)  # (M, D/2), outer product
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
#################################################################################
# PixArt Configs #
#################################################################################
@MODELS.register_module()
def PixArt_XL_2(**kwargs):
    """Build a ``PixArt`` with depth 28, hidden size 1152, patch size 2 and 16 heads."""
    return PixArt(
        depth=28,
        hidden_size=1152,
        patch_size=2,
        num_heads=16,
        **kwargs,
    )
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import torch
import torch.nn as nn
from timm.models.layers import DropPath
from timm.models.vision_transformer import Mlp
from diffusion.model.builder import MODELS
from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, SizeEmbedder
from diffusion.model.nets.PixArt import PixArt, get_2d_sincos_pos_embed
class PatchEmbed(nn.Module):
    """ 2D Image to Patch Embedding

    A strided convolution (kernel size = stride = patch size) maps every
    patch to an ``embed_dim`` vector. No input size is fixed at construction.
    """

    def __init__(
            self,
            patch_size=16,
            in_chans=3,
            embed_dim=768,
            norm_layer=None,
            flatten=True,
            bias=True,
    ):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.flatten = flatten
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=bias)
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` to patch embeddings, optionally flattened to a token sequence."""
        patches = self.proj(x)
        if self.flatten:
            patches = patches.flatten(2).transpose(1, 2)  # BCHW -> BNC
        return self.norm(patches)
class PixArtMSBlock(nn.Module):
    """
    A PixArt block conditioned through a learned scale/shift table.

    Each block runs modulated self-attention, cross-attention against the
    condition tokens, and a modulated MLP, every stage with a residual
    connection. The six modulation tensors come from the block's learned
    ``scale_shift_table`` added to the incoming conditioning tensor ``t``.
    """

    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
        super().__init__()
        self.hidden_size = hidden_size
        self.window_size = window_size

        def approx_gelu():
            return nn.GELU(approximate="tanh")

        # A non-zero window size replaces the attention input size.
        attn_input_size = (window_size, window_size) if window_size != 0 else input_size
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = WindowAttention(
            hidden_size,
            num_heads=num_heads,
            qkv_bias=True,
            input_size=attn_input_size,
            use_rel_pos=use_rel_pos,
            **block_kwargs,
        )
        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)

    def forward(self, x, y, t, mask=None, **kwargs):
        """Apply the block to tokens ``x`` with condition ``y`` and conditioning ``t``.

        ``t`` is reshaped to ``(B, 6, -1)`` and split into shift/scale/gate
        triples for the self-attention and MLP branches. ``mask`` is handed
        to the cross-attention layer; ``**kwargs`` is unused.
        """
        B, N, C = x.shape
        modulation = self.scale_shift_table[None] + t.reshape(B, 6, -1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)

        attn_in = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
        x = x + self.drop_path(gate_msa * self.attn(attn_in))

        x = x + self.cross_attn(x, y, mask)

        mlp_in = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
        x = x + self.drop_path(gate_mlp * self.mlp(mlp_in))
        return x
#############################################################################
# Core PixArt Model #
#################################################################################
@MODELS.register_module()
class PixArtMS(PixArt):
    """
    PixArt variant that accepts inputs of varying height and width.

    Differences from ``PixArt`` visible in this class: the patch embedder has
    no fixed input size, the positional embedding is recomputed for the token
    grid of every input, and the timestep conditioning additionally receives
    embeddings of ``data_info['img_hw']`` and ``data_info['aspect_ratio']``.
    """

    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, learn_sigma=True, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1., config=None, model_max_length=120, **kwargs):
        if window_block_indexes is None:
            window_block_indexes = []
        super().__init__(
            input_size=input_size,
            patch_size=patch_size,
            in_channels=in_channels,
            hidden_size=hidden_size,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            class_dropout_prob=class_dropout_prob,
            learn_sigma=learn_sigma,
            pred_sigma=pred_sigma,
            drop_path=drop_path,
            window_size=window_size,
            window_block_indexes=window_block_indexes,
            use_rel_pos=use_rel_pos,
            lewei_scale=lewei_scale,
            config=config,
            model_max_length=model_max_length,
            **kwargs,
        )
        # Token-grid size of the latest input; set in forward(), read by unpatchify().
        self.h = self.w = 0
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        # The sub-modules below replace the ones created by the parent constructor.
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )
        self.x_embedder = PatchEmbed(patch_size, in_channels, hidden_size, bias=True)
        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
        self.csize_embedder = SizeEmbedder(hidden_size//3)  # c_size embed
        self.ar_embedder = SizeEmbedder(hidden_size//3)  # aspect ratio embed
        drop_path = [rate.item() for rate in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            PixArtMSBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
                          input_size=(input_size // patch_size, input_size // patch_size),
                          window_size=window_size if i in window_block_indexes else 0,
                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
            for i in range(depth)
        ])
        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize()

    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
        """
        Forward pass of PixArt.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, 120, C) tensor of caption embeddings
        mask: optional mask over caption tokens; tokens whose mask value is 0 are dropped
        data_info: mapping with tensors under 'img_hw' and 'aspect_ratio'; required
            in practice (the ``None`` default fails on the first lookup)
        **kwargs: forwarded to every transformer block through ``auto_grad_checkpoint``
        """
        bs = x.shape[0]
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype)
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        # The positional embedding is rebuilt for the token grid of this input.
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep)  # (N, D)
        csize = self.csize_embedder(c_size, bs)  # (N, D)
        ar = self.ar_embedder(ar, bs)  # (N, D)
        # Size and aspect-ratio embeddings are concatenated on the feature axis
        # and added to the timestep embedding.
        t = t + torch.cat([csize, ar], dim=1)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, D)
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            # Pack the kept caption tokens of all samples into one sequence;
            # y_lens records how many tokens each sample contributed.
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        for block in self.blocks:
            x = auto_grad_checkpoint(block, x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, timestep, y, data_info, **kwargs):
        """
        Forward pass for DPM-Solver, which does not need the variance prediction.

        When ``pred_sigma`` is set only the first half of the output channels
        is returned; otherwise the output is returned unchanged (it has no
        second half to drop).
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(x, timestep, y, data_info=data_info, **kwargs)
        return model_out.chunk(2, dim=1)[0] if self.pred_sigma else model_out

    def forward_with_cfg(self, x, timestep, y, cfg_scale, data_info, **kwargs):
        """
        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.

        The first half of ``x`` is duplicated, so ``y`` is expected to stack
        the conditional and the unconditional captions along the batch axis.
        ``**kwargs`` (including any ``mask``) is not forwarded to ``forward``.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, timestep, y, data_info=data_info)
        # NOTE(review): guidance is applied to the first three channels only,
        # although in_channels defaults to 4. Kept as-is because changing it
        # alters sampling results -- confirm the intent.
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        return: (N, C, h * p, w * p), using the token grid (h, w) recorded by the last forward()
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        assert self.h * self.w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        return x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))

    def initialize(self):
        """Apply the weight-initialization scheme to the sub-modules of this class."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)
        # Initialize size and aspect-ratio embedding MLPs:
        nn.init.normal_(self.csize_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.csize_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.ar_embedder.mlp[2].weight, std=0.02)
        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
        # Zero-out the cross-attention output projection in every PixArt block:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)
        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)
#################################################################################
# PixArt Configs #
#################################################################################
@MODELS.register_module()
def PixArtMS_XL_2(**kwargs):
    """Build a ``PixArtMS`` with depth 28, hidden size 1152, patch size 2 and 16 heads."""
    return PixArtMS(
        depth=28,
        hidden_size=1152,
        patch_size=2,
        num_heads=16,
        **kwargs,
    )
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import math
import torch
import torch.nn as nn
from timm.models.vision_transformer import Mlp, Attention as Attention_
from einops import rearrange, repeat
import xformers.ops
from diffusion.model.utils import add_decomposed_rel_pos
def modulate(x, shift, scale):
    """Apply a per-sample affine modulation to ``x``.

    ``shift`` and ``scale`` each get a new axis inserted at position 1 so
    they broadcast over that axis of ``x``; the result is
    ``x * (1 + scale) + shift``.
    """
    gain = 1 + scale.unsqueeze(1)
    offset = shift.unsqueeze(1)
    return x * gain + offset
def t2i_modulate(x, shift, scale):
    """Return ``x * (1 + scale) + shift`` using plain broadcasting.

    Unlike ``modulate``, no axis is inserted: the caller is responsible for
    passing ``shift`` and ``scale`` in a shape that broadcasts against ``x``.
    """
    scaled = x * (1 + scale)
    return scaled + shift
class MultiHeadCrossAttention(nn.Module):
    """Multi-head cross-attention computed with xformers.

    Queries are projected from ``x``; keys and values are projected jointly
    from ``cond``.
    """

    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs):
        """
        Args:
            d_model: Channel width of both inputs; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.
            attn_drop: Dropout probability handed to the attention kernel.
            proj_drop: Dropout probability applied after the output projection.
            **block_kwargs: Accepted for signature compatibility and ignored.
        """
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        self.q_linear = nn.Linear(d_model, d_model)
        # One projection yields keys and values together (hence 2 * d_model).
        self.kv_linear = nn.Linear(d_model, d_model * 2)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(d_model, d_model)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, cond, mask=None):
        """Attend from ``x`` (queries) to ``cond`` (keys/values).

        Args:
            x: Tensor unpacked as ``(B, N, C)``.
            cond: Condition tokens; flattened into a single sequence.
            mask: Optional value passed as the key/value sequence lengths to
                ``BlockDiagonalMask.from_seqlens`` (the query lengths are
                ``[N] * B``). ``None`` disables the attention bias.

        Returns:
            Tensor viewed back to ``(B, -1, C)`` after projection and dropout.
        """
        batch, tokens, channels = x.shape

        # All samples are packed into one batch entry; the block-diagonal
        # bias (when given) is what keeps samples from attending across each other.
        q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
        k, v = kv.unbind(2)

        if mask is None:
            attn_bias = None
        else:
            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([tokens] * batch, mask)

        attended = xformers.ops.memory_efficient_attention(
            q, k, v,
            p=self.attn_drop.p,
            attn_bias=attn_bias,
            op=xformers.ops.TritonFlashAttentionOp,
        )
        attended = attended.view(batch, -1, channels)
        return self.proj_drop(self.proj(attended))
class WindowAttention(Attention_):
    """Multi-head self-attention with optional relative position parameters.

    NOTE(review): the ``rel_pos_h`` / ``rel_pos_w`` tables are created when
    ``use_rel_pos`` is set, but ``forward`` below never reads them.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        use_rel_pos=False,
        rel_pos_zero_init=True,
        input_size=None,
        **block_kwargs,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, create relative positional parameters.
            rel_pos_zero_init (bool): If True, leave the relative positional
                parameters at zero; otherwise draw them from a truncated normal.
            input_size (tuple or None): Input resolution ``(h, w)`` used to size
                the relative positional parameters; indexed only when
                ``use_rel_pos`` is True.
            **block_kwargs: Forwarded to the parent attention constructor.
        """
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs)
        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            # One row per possible relative offset along each axis.
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))
            if not rel_pos_zero_init:
                for table in (self.rel_pos_h, self.rel_pos_w):
                    nn.init.trunc_normal_(table, std=0.02)

    def forward(self, x, mask=None):
        """Run self-attention over ``x`` (unpacked as ``(B, N, C)``).

        Args:
            x: Input tokens.
            mask: Optional mask; positions equal to 0 receive ``-inf`` bias.

        Returns:
            Tensor of shape ``(B, N, C)``.
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).unbind(2)

        # Optional opt-in (attribute set from outside) to run attention in fp32.
        if getattr(self, 'fp32_attention', False):
            q, k, v = q.float(), k.float(), v.float()

        attn_bias = None
        if mask is not None:
            attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
            blocked = mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0
            attn_bias.masked_fill_(blocked, float('-inf'))

        out = xformers.ops.memory_efficient_attention(
            q, k, v,
            p=self.attn_drop.p,
            attn_bias=attn_bias,
            op=xformers.ops.TritonFlashAttentionOp,
        )
        out = out.view(B, N, C)
        return self.proj_drop(self.proj(out))
#################################################################################
# AMP attention with fp32 softmax to fix loss NaN problem during training #
#################################################################################
class Attention(Attention_):
    """Self-attention whose score/softmax step can be forced to fp32.

    When the instance carries a truthy ``fp32_attention`` attribute, queries
    and keys are upcast and autocast is disabled while the attention weights
    are computed.
    """

    def forward(self, x):
        """Apply self-attention to ``x`` (unpacked as ``(B, N, C)``)."""
        batch, tokens, channels = x.shape
        per_head = channels // self.num_heads
        qkv = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, per_head).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        fp32 = getattr(self, 'fp32_attention', False)
        if fp32:
            q, k = q.float(), k.float()

        # Autocast stays on unless fp32 attention was requested.
        with torch.cuda.amp.autocast(enabled=not fp32):
            scores = (q @ k.transpose(-2, -1)) * self.scale
            weights = scores.softmax(dim=-1)

        weights = self.attn_drop(weights)
        out = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(out))
class FinalLayer(nn.Module):
    """Output head: adaLN-modulated LayerNorm followed by a linear projection.

    The projection maps ``hidden_size`` to ``patch_size**2 * out_channels``.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
        # Produces shift and scale concatenated along dim 1.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """Modulate the normalised tokens with ``c`` and project them.

        Assumes ``c`` is a 2-D conditioning tensor (it is chunked along
        dim 1) -- TODO confirm against callers.
        """
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        normed = self.norm_final(x)
        return self.linear(modulate(normed, shift, scale))
class T2IFinalLayer(nn.Module):
    """Output head that modulates with a learned table plus the timestep embedding.

    Instead of an adaLN MLP, a learnable ``(2, hidden_size)`` table is added
    to ``t`` to obtain shift and scale.
    """

    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
        # Row 0 -> shift, row 1 -> scale (see the chunk in forward).
        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
        self.out_channels = out_channels

    def forward(self, x, t):
        """Modulate the normalised tokens and project them.

        ``t`` gets a new axis at position 1 and is added to the table, so the
        sum has size 2 along dim 1 and is split into shift and scale.
        """
        table = self.scale_shift_table[None] + t[:, None]
        shift, scale = table.chunk(2, dim=1)
        normed = self.norm_final(x)
        return self.linear(t2i_modulate(normed, shift, scale))
class MaskFinalLayer(nn.Module):
    """Output head whose conditioning width may differ from the token width.

    The adaLN MLP maps ``c_emb_size`` to ``2 * final_hidden_size``; the tokens
    are normalised, modulated and projected to ``patch_size**2 * out_channels``.
    """

    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels):
        super().__init__()
        patch_dim = patch_size * patch_size * out_channels
        self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(final_hidden_size, patch_dim, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True),
        )

    def forward(self, x, t):
        """Modulate the normalised tokens with ``t`` and project them."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        normed = self.norm_final(x)
        return self.linear(modulate(normed, shift, scale))
class DecoderLayer(nn.Module):
    """adaLN-modulated projection from ``hidden_size`` to ``decoder_hidden_size``."""

    def __init__(self, hidden_size, decoder_hidden_size):
        super().__init__()
        self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, t):
        """Normalise ``x``, modulate it with shift/scale derived from ``t``, then project."""
        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
        normed = self.norm_decoder(x)
        return self.linear(modulate(normed, shift, scale))
#################################################################################
# Embedding Layers for Timesteps and Class Labels #
#################################################################################
class TimestepEmbedder(nn.Module):
    """Embed scalar timesteps: sinusoidal features followed by a two-layer MLP."""

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings, cosines first,
                 then sines, zero-padded by one column when ``dim`` is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        steps = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
        freqs = torch.exp(-math.log(max_period) * steps / half)
        angles = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if dim % 2:
            pad = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat([embedding, pad], dim=-1)
        return embedding

    def forward(self, t):
        """Return the MLP embedding of timesteps ``t``, computed in the module's dtype."""
        features = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(features.to(self.dtype))

    @property
    def dtype(self):
        # Return the data type of the model parameters.
        return next(self.parameters()).dtype
class SizeEmbedder(TimestepEmbedder):
    """Embed one or more scalars per sample and concatenate their embeddings.

    Each scalar is embedded like a timestep; the ``d`` per-sample embeddings
    are then laid side by side, giving ``d * hidden_size`` features per sample.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        # The parent already built an identical MLP; it is rebuilt here exactly
        # as in the original code so parameter initialisation is unchanged.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    def forward(self, s, bs):
        """Embed ``s`` for a batch of size ``bs``.

        Args:
            s: 1-D or 2-D tensor of scalars; a 1-D input is treated as one
                scalar per row. If its first dimension differs from ``bs`` it
                is tiled ``bs // s.shape[0]`` times.
            bs: Expected batch size.

        Returns:
            Tensor of shape ``(bs, d * hidden_size)`` where ``d = s.shape[1]``.
        """
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2

        rows = s.shape[0]
        if rows != bs:
            s = s.repeat(bs // rows, 1)
            assert s.shape[0] == bs

        b, dims = s.shape
        flat = rearrange(s, "b d -> (b d)")
        flat_freq = self.timestep_embedding(flat, self.frequency_embedding_size).to(self.dtype)
        flat_emb = self.mlp(flat_freq)
        return rearrange(flat_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)

    @property
    def dtype(self):
        # Return the data type of the model parameters.
        return next(self.parameters()).dtype
class LabelEmbedder(nn.Module):
    """Embed class labels, with label dropout for classifier-free guidance.

    When ``dropout_prob > 0`` the embedding table gets one extra row (index
    ``num_classes``) that represents the dropped / unconditional label.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        """
        Args:
            num_classes: Number of real classes.
            hidden_size: Embedding width.
            dropout_prob: Probability of replacing a label with the null class
                during training.
        """
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """Replace selected labels with the null class ``num_classes``.

        Args:
            labels: 1-D tensor of class indices.
            force_drop_ids: Optional tensor; entries equal to 1 are dropped.
                When ``None`` the drop mask is sampled with ``dropout_prob``.

        Returns:
            Tensor of labels with the dropped entries set to ``num_classes``.
        """
        if force_drop_ids is None:
            # Sample on the labels' own device rather than hard-coding CUDA,
            # so this also works on CPU-only hosts and non-default devices.
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        return torch.where(drop_ids, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        """Look up label embeddings, applying dropout when training or forced.

        NOTE(review): forcing drops on a module built with ``dropout_prob == 0``
        indexes a row the table does not have.
        """
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
class CaptionEmbedder(nn.Module):
    """Project caption features, with caption dropout for classifier-free guidance.

    A ``(token_num, in_channels)`` buffer ``y_embedding`` stands in for the
    caption of any sample that is dropped.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        """
        Args:
            in_channels: Width of the incoming caption features.
            hidden_size: Output width of the projection MLP.
            uncond_prob: Probability of dropping a caption during training.
            act_layer: Activation handed to ``Mlp``.
                NOTE(review): the default is a module instance; confirm that
                ``Mlp`` accepts an instance rather than a factory.
            token_num: Number of caption tokens in the null embedding.
        """
        super().__init__()
        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """Replace the captions of selected samples with ``y_embedding``.

        Args:
            caption: 4-D tensor whose last two dimensions match ``y_embedding``.
            force_drop_ids: Optional tensor; entries equal to 1 are dropped.
                When ``None`` the drop mask is sampled with ``uncond_prob``.
        """
        if force_drop_ids is None:
            # Sample on the caption's own device rather than hard-coding CUDA,
            # so this also works on CPU-only hosts and non-default devices.
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        return torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)

    def forward(self, caption, train, force_drop_ids=None):
        """Optionally drop captions, then project them with ``y_proj``."""
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        return self.y_proj(caption)
class CaptionEmbedderDoubleBr(nn.Module):
    """Two-branch caption embedder: a pooled global caption plus per-token captions.

    The global branch averages the caption over its token axis and projects
    it; the token branch is returned as-is. Both branches are replaced by
    learned null embeddings when a sample is dropped.
    """

    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
        """
        Args:
            in_channels: Width of the incoming caption features.
            hidden_size: Output width of the projection MLP.
            uncond_prob: Probability of dropping a caption during training.
            act_layer: Activation handed to ``Mlp``.
                NOTE(review): the default is a module instance; confirm that
                ``Mlp`` accepts an instance rather than a factory.
            token_num: Number of caption tokens in the null embedding.
        """
        super().__init__()
        self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
        self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5)
        self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5)
        self.uncond_prob = uncond_prob

    def token_drop(self, global_caption, caption, force_drop_ids=None):
        """Replace both branches of selected samples with their null embeddings.

        Args:
            global_caption: Pooled caption, one row per sample.
            caption: 4-D per-token caption tensor.
            force_drop_ids: Optional tensor; entries equal to 1 are dropped.
                When ``None`` the drop mask is sampled with ``uncond_prob``.
        """
        if force_drop_ids is None:
            # Sample on the input's own device rather than hard-coding CUDA,
            # so this also works on CPU-only hosts and non-default devices.
            drop_ids = torch.rand(global_caption.shape[0], device=global_caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption)
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return global_caption, caption

    def forward(self, caption, train, force_drop_ids=None):
        """Return ``(projected global caption, per-token caption)``.

        Raises:
            AssertionError: if the last two dimensions of ``caption`` do not
                match ``y_embedding``.
        """
        assert caption.shape[2:] == self.y_embedding.shape
        # Squeeze only axis 1: a bare squeeze() would also collapse the batch
        # axis when the batch size is 1.
        global_caption = caption.mean(dim=2).squeeze(1)
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids)
        y_embed = self.proj(global_caption)
        return y_embed, caption
from .PixArt import PixArt, PixArt_XL_2
from .PixArtMS import PixArtMS, PixArtMS_XL_2, PixArtMSBlock
from .pixart_controlnet import ControlPixArtHalf, ControlPixArtMSHalf
\ No newline at end of file
import re
import torch
import torch.nn as nn
from copy import deepcopy
from torch import Tensor
from torch.nn import Module, Linear, init
from typing import Any, Mapping
from diffusion.model.nets import PixArtMSBlock, PixArtMS, PixArt
from diffusion.model.nets.PixArt import get_2d_sincos_pos_embed
from diffusion.model.utils import auto_grad_checkpoint
# The implementation of the ControlNet-Half architecture
# https://github.com/lllyasviel/ControlNet/discussions/188
class ControlT2IDitBlockHalf(Module):
    """Trainable copy of one base block, wrapped with zero-initialised projections.

    The block at index 0 additionally owns ``before_proj``, which maps the
    control signal before it is added to ``x``. Every block owns
    ``after_proj``, which produces the skip tensor handed back to the base
    model. Both projections start at zero.
    """

    def __init__(self, base_block: "PixArtMSBlock", block_index: int) -> None:
        """
        Args:
            base_block: Block to copy; must expose ``hidden_size``.
            block_index: Position of this block in the control branch;
                0 marks the first block.
        """
        super().__init__()
        self.copied_block = deepcopy(base_block)
        self.block_index = block_index

        # The copy is trained even when the base block has been frozen.
        for p in self.copied_block.parameters():
            p.requires_grad_(True)

        self.copied_block.load_state_dict(base_block.state_dict())
        self.copied_block.train()

        self.hidden_size = hidden_size = base_block.hidden_size
        if self.block_index == 0:
            self.before_proj = Linear(hidden_size, hidden_size)
            init.zeros_(self.before_proj.weight)
            init.zeros_(self.before_proj.bias)
        self.after_proj = Linear(hidden_size, hidden_size)
        init.zeros_(self.after_proj.weight)
        init.zeros_(self.after_proj.bias)

    def forward(self, x, y, t, mask=None, c=None):
        """Advance the control state by one block.

        Args:
            x: Current base-model hidden state (used only by the first block).
            y, t, mask: Passed through to the copied block.
            c: Control state; the raw control tokens for the first block, the
                previous control state otherwise.

        Returns:
            ``(c, c_skip)``: the new control state and its ``after_proj`` output.
        """
        if self.block_index == 0:
            # The first block injects the projected control signal into x.
            c = self.before_proj(c)
            c = self.copied_block(x + c, y, t, mask)
        else:
            # Later blocks continue from the previous control state.
            c = self.copied_block(c, y, t, mask)
        c_skip = self.after_proj(c)
        return c, c_skip
# The implementation of ControlPixArtHalf net
class ControlPixArtHalf(Module):
    """ControlNet-Half wrapper around a frozen single-resolution PixArt model.

    The first ``copy_blocks_num`` base blocks are copied into a trainable
    control branch. Attributes not found on the wrapper are looked up on the
    base model (see ``__getattr__``).
    """

    # only support single res model
    def __init__(self, base_model: "PixArt", copy_blocks_num: int = 13) -> None:
        """
        Args:
            base_model: Model to wrap; put in eval mode and frozen.
            copy_blocks_num: Number of leading blocks copied into the control branch.
        """
        super().__init__()
        self.base_model = base_model.eval()
        self.controlnet = []
        self.copy_blocks_num = copy_blocks_num
        self.total_blocks_num = len(base_model.blocks)
        for p in self.base_model.parameters():
            p.requires_grad_(False)

        # Copy first copy_blocks_num block
        for i in range(copy_blocks_num):
            self.controlnet.append(ControlT2IDitBlockHalf(base_model.blocks[i], i))
        self.controlnet = nn.ModuleList(self.controlnet)

    def __getattr__(self, name: str) -> "Tensor | Module":
        """Fallback attribute lookup, delegating unknown names to the base model."""
        if name in ['forward', 'forward_with_dpmsolver', 'forward_with_cfg', 'forward_c', 'load_state_dict']:
            return self.__dict__[name]
        elif name in ['base_model', 'controlnet']:
            return super().__getattr__(name)
        else:
            return getattr(self.base_model, name)

    def forward_c(self, c):
        """Patch-embed the control input and add a matching positional embedding.

        Also updates ``self.h`` / ``self.w`` from the spatial size of ``c``.

        Returns:
            The embedded control tokens, or ``None`` when ``c`` is ``None``.
        """
        # Check for None before touching c.shape; the original guard sat after
        # the dereference and could never take effect.
        if c is None:
            return None
        self.h, self.w = c.shape[-2]//self.patch_size, c.shape[-1]//self.patch_size
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(c.device).to(self.dtype)
        return self.x_embedder(c) + pos_embed

    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
        """Forward pass of PixArt with an optional control branch.

        Args:
            x: (N, C, H, W) tensor of spatial inputs (images or latent
                representations of images).
            timestep: (N,) tensor of diffusion timesteps.
            y: (N, 1, 120, C) tensor of caption conditioning
                (shape as stated by the original note -- TODO confirm).
            mask: Optional caption mask used to drop padded caption tokens.
            data_info: Unused by this single-resolution forward.
            c: Optional control input; when ``None`` only the base blocks run.
            **kwargs: Forwarded to ``auto_grad_checkpoint``.

        Returns:
            (N, out_channels, H, W) tensor.
        """
        # modify the original PixArtMS forward function
        if c is not None:
            c = c.to(self.dtype)
            c = self.forward_c(c)

        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        pos_embed = self.pos_embed.to(self.dtype)
        # Overrides the h/w that forward_c may have set; unpatchify reads these.
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, 1, L, D)

        # Pack all captions into a single sequence; y_lens records how many
        # tokens each sample contributes.
        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # define the first layer
        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint

        if c is not None:
            # update c: each control block feeds its skip output into the
            # next base block
            for index in range(1, self.copy_blocks_num + 1):
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
                x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs)

            # update x: remaining base blocks run without control input
            for index in range(self.copy_blocks_num + 1, self.total_blocks_num):
                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
        else:
            for index in range(1, self.total_blocks_num):
                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)

        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)  # (N, out_channels, H, W)
        return x

    def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs):
        """Run ``forward`` and keep the first half of the output channels."""
        model_out = self.forward(x, t, y, data_info=data_info, c=c, **kwargs)
        return model_out.chunk(2, dim=1)[0]

    def forward_with_cfg(self, x, t, y, cfg_scale, data_info, c, **kwargs):
        """Delegate to the base model's ``forward_with_cfg`` with the embedded control input."""
        return self.base_model.forward_with_cfg(x, t, y, cfg_scale, data_info, c=self.forward_c(c), **kwargs)

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        """Load either a full wrapper checkpoint or a bare base-model checkpoint.

        If every key starts with ``base_model`` or ``controlnet`` the dict is
        loaded into the wrapper. Otherwise keys matching ``blocks.<n>...`` are
        renamed in place (``state_dict`` is mutated) and the result is loaded
        into the base model.
        """
        if all((k.startswith('base_model') or k.startswith('controlnet')) for k in state_dict.keys()):
            return super().load_state_dict(state_dict, strict)
        else:
            new_key = {}
            for k in state_dict.keys():
                new_key[k] = re.sub(r"(blocks\.\d+)(.*)", r"\1.base_block\2", k)
            for k, v in new_key.items():
                if k != v:
                    print(f"replace {k} to {v}")
                    state_dict[v] = state_dict.pop(k)
            return self.base_model.load_state_dict(state_dict, strict)

    def unpatchify(self, x):
        """Fold patch tokens back into an image-shaped tensor.

        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        assert self.h * self.w == x.shape[1]
        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))
        return imgs

    @property
    def dtype(self):
        # Return the data type of the model parameters.
        return next(self.parameters()).dtype
# The implementation for PixArtMS_Half + 1024 resolution
class ControlPixArtMSHalf(ControlPixArtHalf):
    """ControlNet-Half wrapper for the multi-scale PixArtMS model.

    The multi-scale model can also be used for single-resolution training
    and inference.
    """

    def __init__(self, base_model: PixArtMS, copy_blocks_num: int = 13) -> None:
        super().__init__(base_model=base_model, copy_blocks_num=copy_blocks_num)

    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
        """Forward pass of PixArtMS with an optional control branch.

        Args:
            x: (N, C, H, W) tensor of spatial inputs (images or latent
                representations of images).
            timestep: (N,) tensor of diffusion timesteps.
            y: (N, 1, 120, C) tensor of caption conditioning
                (shape as stated by the original note -- TODO confirm).
            mask: Optional caption mask used to drop padded caption tokens.
            data_info: Mapping providing ``'img_hw'`` and ``'aspect_ratio'``.
            c: Optional control input; when ``None`` only the base blocks run.
            **kwargs: Forwarded to ``auto_grad_checkpoint``.

        Returns:
            (N, out_channels, H, W) tensor.
        """
        # modify the original PixArtMS forward function
        if c is not None:
            c = self.forward_c(c.to(self.dtype))

        bs = x.shape[0]
        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)
        c_size = data_info['img_hw'].to(self.dtype)
        ar = data_info['aspect_ratio'].to(self.dtype)

        # Overrides the h/w that forward_c may have set; unpatchify reads these.
        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
        # The positional embedding is rebuilt for the current token grid.
        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2

        t = self.t_embedder(timestep)  # (N, D)
        csize = self.csize_embedder(c_size, bs)  # (N, D)
        ar = self.ar_embedder(ar, bs)  # (N, D)
        t = t + torch.cat([csize, ar], dim=1)
        t0 = self.t_block(t)
        y = self.y_embedder(y, self.training)  # (N, D)

        # Pack all captions into a single sequence; y_lens records how many
        # tokens each sample contributes.
        if mask is None:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])
        else:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()

        # define the first layer
        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint

        first_plain_block = 1
        if c is not None:
            # Control phase: each control block feeds its skip output into
            # the next base block.
            for index in range(1, self.copy_blocks_num + 1):
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
                x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs)
            first_plain_block = self.copy_blocks_num + 1

        # Remaining base blocks run without control input.
        for index in range(first_plain_block, self.total_blocks_num):
            x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)

        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
        return self.unpatchify(x)  # (N, out_channels, H, W)
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
import numpy as np
import torch as th
from .gaussian_diffusion import GaussianDiffusion
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps in "ddimN"
                        mode, or if a section is asked for more steps than it
                        contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Take the first (smallest) integer stride with the requested count.
            for stride in range(1, num_timesteps):
                strided = range(0, num_timesteps, stride)
                if len(strided) == desired_count:
                    return set(strided)
            # Report the requested count, not the size of the original process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]

    # Split the process into near-equal sections; the first `extra` sections
    # absorb the remainder, one step each.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride so the first and last step of the section are both hit.
        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # Build the full process once to read its cumulative alphas, then
        # derive betas that reproduce them on the retained steps only.
        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        prev_alpha_cumprod = 1.0
        respaced_betas = []
        for step, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            respaced_betas.append(1 - alpha_cumprod / prev_alpha_cumprod)
            prev_alpha_cumprod = alpha_cumprod
            # Position in this list = new step index; value = original step.
            self.timestep_map.append(step)
        kwargs["betas"] = np.array(respaced_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def training_losses_diffusers(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().training_losses_diffusers(wrapped, *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_mean(wrapped, *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_score(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        """Wrap ``model`` so it receives original-process timesteps (idempotent)."""
        if not isinstance(model, _WrappedModel):
            model = _WrappedModel(model, self.timestep_map, self.original_num_steps)
        return model

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
class _WrappedModel:
    """Callable that translates respaced timestep indices back to original ones.

    ``timestep_map[i]`` is the original-process timestep for respaced index
    ``i``; the wrapped model is always called with the original timesteps.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        self.original_num_steps = original_num_steps

    def __call__(self, x, timestep, **kwargs):
        """Call the model with ``timestep`` mapped through ``timestep_map``."""
        # The lookup table is built on the caller's device/dtype so it can be
        # indexed directly by the incoming timestep tensor.
        lookup = th.tensor(self.timestep_map, device=timestep.device, dtype=timestep.dtype)
        original_ts = lookup[timestep]
        return self.model(x, timestep=original_ts, **kwargs)
import torch
import torch.nn.functional as F
import math
from tqdm import tqdm
class NoiseScheduleVP:
def __init__(
self,
schedule='discrete',
betas=None,
alphas_cumprod=None,
continuous_beta_0=0.1,
continuous_beta_1=20.,
dtype=torch.float32,
):
"""Thanks to DPM-Solver for their code base"""
"""Create a wrapper class for the forward SDE (VP type).
***
Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t.
We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
***
The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
log_alpha_t = self.marginal_log_mean_coeff(t)
sigma_t = self.marginal_std(t)
lambda_t = self.marginal_lambda(t)
Moreover, as lambda(t) is an invertible function, we also support its inverse function:
t = self.inverse_lambda(lambda_t)
===============================================================
We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
1. For discrete-time DPMs:
For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
t_i = (i + 1) / N
e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
Args:
betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
**Important**: Please pay special attention for the args for `alphas_cumprod`:
The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
alpha_{t_n} = \sqrt{\hat{alpha_n}},
and
log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
2. For continuous-time DPMs:
We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
schedule are the default settings in DDPM and improved-DDPM:
Args:
beta_min: A `float` number. The smallest beta for the linear schedule.
beta_max: A `float` number. The largest beta for the linear schedule.
cosine_s: A `float` number. The hyperparameter in the cosine schedule.
cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
T: A `float` number. The ending time of the forward process.
===============================================================
Args:
schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
'linear' or 'cosine' for continuous-time DPMs.
Returns:
A wrapper object of the forward SDE (VP type).
===============================================================
Example:
# For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
>>> ns = NoiseScheduleVP('discrete', betas=betas)
# For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
>>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
# For continuous-time DPMs (VPSDE), linear schedule:
>>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
"""
if schedule not in ['discrete', 'linear', 'cosine']:
raise ValueError(
f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'"
)
self.schedule = schedule
if schedule == 'discrete':
if betas is not None:
log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
else:
assert alphas_cumprod is not None
log_alphas = 0.5 * torch.log(alphas_cumprod)
self.total_N = len(log_alphas)
self.T = 1.
self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype)
else:
self.total_N = 1000
self.beta_0 = continuous_beta_0
self.beta_1 = continuous_beta_1
self.cosine_s = 0.008
self.cosine_beta_max = 999.
self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
1. + self.cosine_s) / math.pi - self.cosine_s
self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
self.schedule = schedule
self.T = 0.9946 if schedule == 'cosine' else 1.
def marginal_log_mean_coeff(self, t):
    """
    Return log(alpha_t) for the continuous-time label(s) `t` in [0, T].

    The formula depends on `self.schedule`:
      * 'linear'   - closed form in `beta_0` / `beta_1`.
      * 'cosine'   - closed form in `cosine_s`, shifted by `cosine_log_alpha_0`.
      * 'discrete' - interpolation of `log_alpha_array` over `t_array`
                     (result flattened to 1-D).
    Any other schedule value yields None.
    """
    kind = self.schedule
    if kind == 'linear':
        beta_gap = self.beta_1 - self.beta_0
        return -0.25 * t ** 2 * beta_gap - 0.5 * t * self.beta_0
    if kind == 'cosine':
        angle = (t + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.
        return torch.log(torch.cos(angle)) - self.cosine_log_alpha_0
    if kind == 'discrete':
        dev = t.device
        queries = t.reshape((-1, 1))
        interpolated = interpolate_fn(queries, self.t_array.to(dev), self.log_alpha_array.to(dev))
        return interpolated.reshape((-1))
    return None
def marginal_alpha(self, t):
    """
    Return alpha_t for the continuous-time label(s) `t` in [0, T],
    obtained by exponentiating `marginal_log_mean_coeff(t)`.
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    return torch.exp(log_alpha)
def marginal_std(self, t):
    """
    Return sigma_t = sqrt(1 - alpha_t^2) for the continuous-time label(s) `t` in [0, T].
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    variance = 1. - torch.exp(2. * log_alpha)
    return torch.sqrt(variance)
def marginal_lambda(self, t):
    """
    Return the half-logSNR lambda_t = log(alpha_t) - log(sigma_t)
    for the continuous-time label(s) `t` in [0, T].
    """
    log_alpha = self.marginal_log_mean_coeff(t)
    variance = 1. - torch.exp(2. * log_alpha)
    return log_alpha - 0.5 * torch.log(variance)
def inverse_lambda(self, lamb):
    """
    Map a half-logSNR value lambda_t back to its continuous-time label t in [0, T].

    'linear' uses a closed-form root, 'discrete' interpolates over the
    (flipped) `log_alpha_array` / `t_array` tables and returns a 1-D tensor,
    and every other schedule is handled with the cosine closed form.
    """
    kind = self.schedule
    if kind == 'discrete':
        dev = lamb.device
        log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(dev), -2. * lamb)
        # Tables are flipped so the interpolation keys run in the opposite order.
        keys = torch.flip(self.log_alpha_array.to(dev), [1])
        values = torch.flip(self.t_array.to(dev), [1])
        t = interpolate_fn(log_alpha.reshape((-1, 1)), keys, values)
        return t.reshape((-1,))

    zero = torch.zeros((1,)).to(lamb)
    if kind == 'linear':
        tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, zero)
        Delta = self.beta_0 ** 2 + tmp
        return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)

    log_alpha = -0.5 * torch.logaddexp(-2. * lamb, zero)
    cos_value = torch.exp(log_alpha + self.cosine_log_alpha_0)
    return torch.arccos(cos_value) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
def edm_sigma(self, t):
    """Return the ratio sigma_t / alpha_t at time `t`."""
    std = self.marginal_std(t)
    mean_coeff = self.marginal_alpha(t)
    return std / mean_coeff
def edm_inverse_sigma(self, edmsigma):
    """
    Map a ratio sigma/alpha (`edmsigma`) back to a time label by converting it
    to a half-logSNR and delegating to `inverse_lambda`.
    """
    mean_coeff = 1 / (edmsigma ** 2 + 1).sqrt()
    std = mean_coeff * edmsigma
    half_log_snr = torch.log(mean_coeff / std)
    return self.inverse_lambda(half_log_snr)
def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs=None,
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    Thanks to DPM-Solver for their code base.

    SA-Solver needs to solve the continuous-time diffusion SDEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
    We support four types of the diffusion model by setting `model_type`:
        1. "noise": noise prediction model. (Trained by predicting noise).
        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).
        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follows a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```
    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``
            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``
            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).
    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).
    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for SA-Solver.
    ===============================================================
    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict` (or None for no extra inputs). The other inputs of the model function.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict` (or None for no extra inputs). The other inputs of the classifier function.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    Raises:
        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
    """
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    # Use fresh dicts instead of shared mutable default arguments.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """Evaluate `model` and convert its output to a noise prediction."""
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        # NOTE: the conversions below index entry [0] of alpha_t / sigma_t, i.e. one
        # coefficient is applied to the whole batch (all entries of `t_continuous`
        # are treated as sharing the same time).
        if model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return (x - alpha_t[0] * output) / sigma_t[0]
        if model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            return alpha_t[0] * output + sigma_t[0] * x
        # model_type == "score"
        sigma_t = noise_schedule.marginal_std(t_continuous)
        return -sigma_t[0] * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for SA-Solver.
        """
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        if guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            # Reshape the per-sample sigma from (B,) to (B, 1, ..., 1) so that it scales
            # each sample of the gradient instead of broadcasting along the last axis.
            sigma_t = sigma_t.reshape((-1,) + (1,) * (x.dim() - 1))
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * sigma_t * cond_grad
        # guidance_type == "classifier-free"
        if guidance_scale == 1. or unconditional_condition is None:
            return noise_pred_fn(x, t_continuous, cond=condition)
        # Evaluate the unconditional and conditional branches in one batched call.
        x_in = torch.cat([x] * 2)
        t_in = torch.cat([t_continuous] * 2)
        c_in = torch.cat([unconditional_condition, condition])
        noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
        return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
class SASolver:
def __init__(
        self,
        model_fn,
        noise_schedule,
        algorithm_type="data_prediction",
        correcting_x0_fn=None,
        correcting_xt_fn=None,
        thresholding_max_val=1.,
        dynamic_thresholding_ratio=0.995
):
    """
    Construct a SA-Solver
    The default value for algorithm_type is "data_prediction" and we recommend not to change it to
    "noise_prediction". For details, please see Appendix A.2.4 in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf

    `correcting_x0_fn` may be the string "dynamic_thresholding", in which case the
    solver's own `dynamic_thresholding_fn` is used; any other value is stored as given.
    """

    def batched_model(x, t):
        # Expand the time label to one entry per sample before calling the wrapped model.
        return model_fn(x, t.expand((x.shape[0])))

    self.model = batched_model
    self.noise_schedule = noise_schedule
    assert algorithm_type in ["data_prediction", "noise_prediction"]
    self.predict_x0 = algorithm_type == "data_prediction"

    use_builtin_thresholding = correcting_x0_fn == "dynamic_thresholding"
    self.correcting_x0_fn = self.dynamic_thresholding_fn if use_builtin_thresholding else correcting_x0_fn
    self.correcting_xt_fn = correcting_xt_fn
    self.thresholding_max_val = thresholding_max_val
    self.dynamic_thresholding_ratio = dynamic_thresholding_ratio

    # Range of sigma/alpha ratios, evaluated at t = 1e-3 and t = 1.
    edm_sigma = self.noise_schedule.edm_sigma
    self.sigma_min = float(edm_sigma(torch.tensor([1e-3])))
    self.sigma_max = float(edm_sigma(torch.tensor([1])))
def dynamic_thresholding_fn(self, x0, t=None):
    """
    The dynamic thresholding method.

    For every sample, the `dynamic_thresholding_ratio` quantile of |x0| is taken
    (never below `thresholding_max_val`); x0 is clamped to that magnitude and
    divided by it. `t` is accepted but not used.
    """
    flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
    per_sample_q = torch.quantile(flat_abs, self.dynamic_thresholding_ratio, dim=1)
    floor = self.thresholding_max_val * torch.ones_like(per_sample_q).to(per_sample_q.device)
    scale = expand_dims(torch.maximum(per_sample_q, floor), x0.dim())
    return torch.clamp(x0, -scale, scale) / scale
def noise_prediction_fn(self, x, t):
    """
    Evaluate the wrapped model at (x, t) and return its noise prediction.
    """
    predicted_noise = self.model(x, t)
    return predicted_noise
def data_prediction_fn(self, x, t):
    """
    Return the data prediction x0 = (x - sigma_t * noise) / alpha_t,
    passed through `correcting_x0_fn` when one is configured.
    """
    eps = self.noise_prediction_fn(x, t)
    schedule = self.noise_schedule
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    x0 = (x - sigma_t * eps) / alpha_t
    corrector = self.correcting_x0_fn
    if corrector is None:
        return x0
    return corrector(x0)
def model_fn(self, x, t):
    """
    Dispatch to the data prediction model when `predict_x0` is set,
    otherwise to the noise prediction model.
    """
    predictor = self.data_prediction_fn if self.predict_x0 else self.noise_prediction_fn
    return predictor(x, t)
def get_time_steps(self, skip_type, t_T, t_0, N, order, device):
    """Compute the N + 1 time steps used for sampling, running from `t_T` to `t_0`.

    `skip_type` selects the spacing:
      * 'time'   - uniform in t ** (1 / order).
      * 'karras' - uniform in (sigma/alpha) ** (1 / 7), with the range clipped to [0.002, 80].
      * 'logSNR' - uniform in (lambda - lambda_T) ** (1 / order).
    Any other value raises ValueError.
    """
    n_points = N + 1
    if skip_type == 'time':
        grid = torch.linspace(t_T ** (1. / order), t_0 ** (1. / order), n_points)
        return grid.pow(order).to(device)
    if skip_type == 'karras':
        lowest = max(0.002, self.sigma_min)
        highest = min(80, self.sigma_max)
        sigma_steps = torch.linspace(highest ** (1. / 7), lowest ** (1. / 7), n_points).pow(7).to(device)
        return self.noise_schedule.edm_inverse_sigma(sigma_steps)
    if skip_type == 'logSNR':
        lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
        lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
        span_root = (lambda_0 - lambda_T).cpu().item() ** (1. / order)
        offsets = torch.linspace(torch.tensor(0.).cpu().item(), span_root, n_points).pow(order).to(device)
        return self.noise_schedule.inverse_lambda(lambda_T + offsets)
    raise ValueError(
        f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time' or 'karras'"
    )
def denoise_to_zero_fn(self, x, s):
    """
    Final denoising step: return the data prediction at (x, s), which is equivalent to
    solving the ODE from lambda_s to infinity by first-order discretization.
    """
    denoised = self.data_prediction_fn(x, s)
    return denoised
def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
    """
    Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end
    For calculating the coefficient of gradient terms after the lagrange interpolation,
    see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
    For noise_prediction formula.

    Written as exp(-end) * (P(start) * exp(end - start) - P(end)), where
    -P(x) * exp(-x) is the antiderivative of x^order * exp(-x).
    """
    assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

    def antiderivative_poly(v):
        if order == 0:
            return 1
        if order == 1:
            return v + 1
        if order == 2:
            return v ** 2 + 2 * v + 2
        return v ** 3 + 3 * v ** 2 + 6 * v + 6

    growth = torch.exp(interval_end - interval_start)
    return torch.exp(-interval_end) * (
            antiderivative_poly(interval_start) * growth - antiderivative_poly(interval_end))
def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
    """
    Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end
    For calculating the coefficient of gradient terms after the lagrange interpolation,
    see Eq.(15) and Eq.(18) in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
    For data_prediction formula.

    Uses the change of variable u = (1 + tau^2) * x, where P(u) * exp(u) is the
    antiderivative of u^order * exp(u), then divides by (1 + tau^2) ** (order + 1).
    """
    assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"

    rate = 1 + tau ** 2
    # interval bounds after the change of variable
    end_cov = rate * interval_end
    start_cov = rate * interval_start

    def antiderivative_poly(u):
        if order == 0:
            return 1
        if order == 1:
            return u - 1
        if order == 2:
            return u ** 2 - 2 * u + 2
        return u ** 3 - 3 * u ** 2 + 6 * u - 6

    decay = torch.exp(-(end_cov - start_cov))
    return torch.exp(end_cov) * (
            antiderivative_poly(end_cov) - antiderivative_poly(start_cov) * decay) / (rate ** (order + 1))
def lagrange_polynomial_coefficient(self, order, lambda_list):
    """
    Calculate the coefficient of lagrange polynomial
    For lagrange interpolation

    Row i of the result holds the coefficients (highest power first) of the
    Lagrange basis polynomial that is 1 at lambda_list[i] and 0 at the other nodes.
    """
    assert order in [0, 1, 2, 3]
    assert order == len(lambda_list) - 1
    if order == 0:
        return [[1]]

    rows = []
    for i in range(order + 1):
        node = lambda_list[i]
        others = [lambda_list[j] for j in range(order + 1) if j != i]

        # Product of (node - other) over the remaining nodes, in index order.
        denominator = node - others[0]
        for other in others[1:]:
            denominator = denominator * (node - other)

        # Expanded coefficients of prod(x - other), each divided by the denominator.
        if order == 1:
            a, = others
            row = [1 / denominator, -a / denominator]
        elif order == 2:
            a, b = others
            row = [1 / denominator,
                   (-a - b) / denominator,
                   a * b / denominator]
        else:
            a, b, c = others
            row = [1 / denominator,
                   (-a - b - c) / denominator,
                   (a * b + a * c + b * c) / denominator,
                   (-a * b * c) / denominator]
        rows.append(row)
    return rows
def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
    """
    Calculate the coefficient of gradients.

    Each coefficient combines one row of Lagrange polynomial coefficients with
    the exponential integrals of the matching powers: the "positive" integrals
    when `predict_x0` is set, the "negative" ones otherwise.
    """
    assert order in [1, 2, 3, 4]
    assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
    lagrange_rows = self.lagrange_polynomial_coefficient(order - 1, lambda_list)

    coefficients = []
    for i in range(order):
        total = 0
        for j in range(order):
            power = order - 1 - j
            if self.predict_x0:
                integral = self.get_coefficients_exponential_positive(power, interval_start, interval_end, tau)
            else:
                integral = self.get_coefficients_exponential_negative(power, interval_start, interval_end)
            total = total + lagrange_rows[i][j] * integral
        coefficients.append(total)
    assert len(coefficients) == order, 'the length of coefficients does not match the order'
    return coefficients
def adams_bashforth_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Predictor, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf

    Advances `x` from time t_prev_list[-1] to `t` using the last `order` entries of
    `model_prev_list` (most recent last) and the injected `noise`.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    schedule = self.noise_schedule
    t_prev = t_prev_list[-1]
    # noise schedule at the target time and at the previous time
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_prev)
    sigma_prev = schedule.marginal_std(t_prev)
    h = lambda_t - schedule.marginal_lambda(t_prev)

    # interpolation nodes: the `order` most recent previous times, newest first
    lambda_nodes = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
    coeffs = self.get_coefficients_fn(order, schedule.marginal_lambda(t_prev), lambda_t, lambda_nodes, tau)

    gradient_part = torch.zeros_like(x)
    if self.predict_x0:
        weight = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        for k in range(order):
            gradient_part += weight * coeffs[k] * model_prev_list[-(k + 1)]
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        return torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part

    weight = -(1 + tau ** 2) * alpha_t
    for k in range(order):
        gradient_part += weight * coeffs[k] * model_prev_list[-(k + 1)]
    noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
    return (alpha_t / alpha_prev) * x + gradient_part + noise_part
def adams_moulton_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf

    Args:
        order: corrector order, one of 1, 2, 3, 4.
        x: state at time t_prev_list[-1].
        tau: stochasticity value used for this step.
        model_prev_list: model outputs, most recent last; the last entry is the
            evaluation at the target time `t`.
        t_prev_list: list of previous times, most recent last (excluding `t`).
        noise: noise tensor injected in this step.
        t: target time.
    Returns:
        The corrected state at time `t`.
    Raises:
        AssertionError: if `order` is not in [1, 2, 3, 4].
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams moulton method is only supported for 1, 2, 3 and 4"
    # get noise schedule
    ns = self.noise_schedule
    alpha_t = ns.marginal_alpha(t)
    sigma_t = ns.marginal_std(t)
    lambda_t = ns.marginal_lambda(t)
    alpha_prev = ns.marginal_alpha(t_prev_list[-1])
    sigma_prev = ns.marginal_std(t_prev_list[-1])
    gradient_part = torch.zeros_like(x)
    h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
    # Unlike the predictor, the interpolation nodes include the target time itself.
    t_list = t_prev_list + [t]
    lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                     lambda_list, tau)
    if self.predict_x0:
        scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
        for i in range(order):
            gradient_part += scale * gradient_coefficients[i] * model_prev_list[-(i + 1)]
        noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
        return torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part

    scale = -(1 + tau ** 2) * alpha_t
    for i in range(order):
        gradient_part += scale * gradient_coefficients[i] * model_prev_list[-(i + 1)]
    noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
    return (alpha_t / alpha_prev) * x + gradient_part + noise_part
def adams_bashforth_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Predictor, with the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf

    Same update as `adams_bashforth_update`, except that for data prediction with
    order == 2 the two gradient coefficients are shifted by an extra term.
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"

    schedule = self.noise_schedule
    t_prev = t_prev_list[-1]
    # noise schedule at the target time and at the previous time
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    lambda_t = schedule.marginal_lambda(t)
    alpha_prev = schedule.marginal_alpha(t_prev)
    sigma_prev = schedule.marginal_std(t_prev)
    h = lambda_t - schedule.marginal_lambda(t_prev)

    lambda_nodes = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
    coeffs = self.get_coefficients_fn(order, schedule.marginal_lambda(t_prev), lambda_t, lambda_nodes, tau)

    gradient_part = torch.zeros_like(x)
    if not self.predict_x0:
        weight = -(1 + tau ** 2) * alpha_t
        for k in range(order):
            gradient_part += weight * coeffs[k] * model_prev_list[-(k + 1)]
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
        return (alpha_t / alpha_prev) * x + gradient_part + noise_part

    if order == 2:
        # If order = 2 we do a modification that does not influence the convergence order,
        # similar to UniPC. Note: this is used only for few steps sampling.
        # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
        # (In the ODE case, tau = 0, this reduces to
        #  exp(lambda_t) * (h ** 2 / 2 - (h - 1 + exp(-h))) / (lambda_prev1 - lambda_prev2).)
        shift = 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                (1 + tau ** 2) ** 2)) / (schedule.marginal_lambda(t_prev_list[-1]) - schedule.marginal_lambda(
                t_prev_list[-2]))
        coeffs[0] += shift
        coeffs[1] -= shift

    weight = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
    for k in range(order):
        gradient_part += weight * coeffs[k] * model_prev_list[-(k + 1)]
    noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    return torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
def adams_moulton_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
    """
    SA-Corrector, with the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf

    Same update as `adams_moulton_update`, except that for data prediction with
    order == 2 the two gradient coefficients are shifted by an extra term.

    Args:
        order: corrector order, one of 1, 2, 3, 4.
        x: state at time t_prev_list[-1].
        tau: stochasticity value used for this step.
        model_prev_list: model outputs, most recent last; the last entry is the
            evaluation at the target time `t`.
        t_prev_list: list of previous times, most recent last (excluding `t`).
        noise: noise tensor injected in this step.
        t: target time.
    Returns:
        The corrected state at time `t`.
    Raises:
        AssertionError: if `order` is not in [1, 2, 3, 4].
    """
    assert order in [1, 2, 3, 4], "order of stochastic adams moulton method is only supported for 1, 2, 3 and 4"
    # get noise schedule
    ns = self.noise_schedule
    alpha_t = ns.marginal_alpha(t)
    sigma_t = ns.marginal_std(t)
    lambda_t = ns.marginal_lambda(t)
    alpha_prev = ns.marginal_alpha(t_prev_list[-1])
    sigma_prev = ns.marginal_std(t_prev_list[-1])
    gradient_part = torch.zeros_like(x)
    h = lambda_t - ns.marginal_lambda(t_prev_list[-1])
    # Unlike the predictor, the interpolation nodes include the target time itself.
    t_list = t_prev_list + [t]
    lambda_list = [ns.marginal_lambda(t_list[-(i + 1)]) for i in range(order)]
    gradient_coefficients = self.get_coefficients_fn(order, ns.marginal_lambda(t_prev_list[-1]), lambda_t,
                                                     lambda_list, tau)
    if not self.predict_x0:
        scale = -(1 + tau ** 2) * alpha_t
        for i in range(order):
            gradient_part += scale * gradient_coefficients[i] * model_prev_list[-(i + 1)]
        noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
        return (alpha_t / alpha_prev) * x + gradient_part + noise_part

    if order == 2:
        # If order = 2 we do a modification that does not influence the convergence order,
        # similar to UniPC. Note: this is used only for few steps sampling.
        # The added term is O(h^3). Empirically we find it will slightly improve the image quality.
        # (In the ODE case, tau = 0, this reduces to
        #  exp(lambda_t) * (h / 2 - (h - 1 + exp(-h)) / h).)
        shift = 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
                (1 + tau ** 2) ** 2 * h))
        gradient_coefficients[0] += shift
        gradient_coefficients[1] -= shift

    scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
    for i in range(order):
        gradient_part += scale * gradient_coefficients[i] * model_prev_list[-(i + 1)]
    noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
    return torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x + gradient_part + noise_part
def sample_few_steps(self, x, tau, steps=5, t_start=None, t_end=None, skip_type='time', skip_order=1,
                     predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                     ):
    """
    Run the predictor-corrector sampling loop using the "few steps" update rules
    (`adams_bashforth_update_few_steps` / `adams_moulton_update_few_steps`).

    For the PC-mode, please refer to the wiki page
    https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
    'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations
    We recommend use pc_mode='PEC' for NFEs is limited. 'PECE' mode is only for test with sufficient NFEs.

    This variant hard-codes: the first step is evaluated, the final step is a
    predictor-only update with tau = 0 and no model evaluation, the order is
    lowered towards the end, and no extra denoise-to-zero step is applied.

    Args:
        x: the state at the starting time (presumably the initial noise sample - confirm with caller).
        tau: a callable; `tau(t)` is evaluated at each step's time and passed to the update rules.
        steps: number of sampling steps; must be >= max(predictor_order, corrector_order - 1).
        t_start: starting time; defaults to `self.noise_schedule.T`.
        t_end: ending time; defaults to `1. / self.noise_schedule.total_N`.
        skip_type: spacing of the time steps, forwarded to `get_time_steps`.
        skip_order: `order` argument forwarded to `get_time_steps`.
        predictor_order: maximum order of the predictor.
        corrector_order: maximum order of the corrector; 0 disables the corrector.
        pc_mode: 'PEC' or 'PECE'.
        return_intermediate: if True, also return the list of states recorded after each step.
    Returns:
        `x` at the final time, or `(x, intermediates)` if `return_intermediate` is True.
    """
    # Fixed behaviour switches of the "few steps" sampler.
    skip_first_step = False
    skip_final_step = True
    lower_order_final = True
    denoise_to_zero = False
    assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
    device = x.device
    intermediates = []
    with torch.no_grad():
        assert steps >= max(predictor_order, corrector_order - 1)
        timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                        device=device)
        assert timesteps.shape[0] - 1 == steps
        # Init the initial values.
        step = 0
        t = timesteps[step]
        # NOTE: this draw is not used below, but it does advance the random generator.
        noise = torch.randn_like(x)
        t_prev_list = [t]
        # do not evaluate if skip_first_step
        if skip_first_step:
            if self.predict_x0:
                alpha_t = self.noise_schedule.marginal_alpha(t)
                sigma_t = self.noise_schedule.marginal_std(t)
                model_prev_list = [(1 - sigma_t) / alpha_t * x]
            else:
                model_prev_list = [x]
        else:
            model_prev_list = [self.model_fn(x, t)]
        if self.correcting_xt_fn is not None:
            x = self.correcting_xt_fn(x, t, step)
        if return_intermediate:
            intermediates.append(x)
        # determine the first several values
        # Warm-up: the orders grow with the number of available history entries.
        for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):
            t = timesteps[step]
            predictor_order_used = min(predictor_order, step)
            corrector_order_used = min(corrector_order, step + 1)
            # The same noise draw is shared by the predictor and the corrector of this step.
            noise = torch.randn_like(x)
            # predictor step
            x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                        noise=noise, t=t)
            # evaluation step
            model_x = self.model_fn(x_p, t)
            # update model_list
            model_prev_list.append(model_x)
            # corrector step
            if corrector_order > 0:
                x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                        noise=noise, t=t)
            else:
                x = x_p
            # evaluation step if correction and mode = pece
            if corrector_order > 0 and pc_mode == 'PECE':
                # Replace the evaluation at the predicted point by one at the corrected point.
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            t_prev_list.append(t)
        # Main loop: full-order steps, with the order reduced near the end if lower_order_final.
        for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
            if lower_order_final:
                predictor_order_used = min(predictor_order, steps - step + 1)
                corrector_order_used = min(corrector_order, steps - step + 2)
            else:
                predictor_order_used = predictor_order
                corrector_order_used = corrector_order
            t = timesteps[step]
            noise = torch.randn_like(x)
            # predictor step
            if skip_final_step and step == steps and not denoise_to_zero:
                # Final step: tau = 0 is passed instead of tau(t).
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=0,
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)
            else:
                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
                                                            model_prev_list=model_prev_list,
                                                            t_prev_list=t_prev_list, noise=noise, t=t)
            # evaluation step
            # do not evaluate if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_x = self.model_fn(x_p, t)
            # update model_list
            # do not update if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_prev_list.append(model_x)
            # corrector step
            # do not correct if skip_final_step and step = steps
            if corrector_order > 0 and (not skip_final_step or step < steps):
                x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
                                                        model_prev_list=model_prev_list,
                                                        t_prev_list=t_prev_list, noise=noise, t=t)
            else:
                x = x_p
            # evaluation step if mode = pece and step != steps
            if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            t_prev_list.append(t)
            # Drop the oldest model output on every iteration of the main loop.
            del model_prev_list[0]
        if denoise_to_zero:
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
    return (x, intermediates) if return_intermediate else x
def sample_more_steps(self, x, tau, steps=20, t_start=None, t_end=None, skip_type='time', skip_order=1,
                      predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
                      ):
    """
    Run the predictor-corrector sampling loop in its 'more_steps' configuration.

    For the PC-mode, please refer to the wiki page
    https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
    'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations
    We recommend using pc_mode='PEC' when NFEs are limited. 'PECE' mode is only for tests with sufficient NFEs.

    This variant hard-codes its behaviour flags below: the first and the final step are both
    evaluated, the orders are lowered towards the end of the trajectory, and a final
    `denoise_to_zero_fn` call is applied at `t_0`.

    Args:
        x: the starting tensor; its device is used for the time grid and `torch.randn_like(x)`
            draws fresh noise from it at every step.
        tau: a callable, invoked as `tau(t)` for each time point and forwarded to the update rules.
        steps: number of intervals in the time grid (the grid must have `steps + 1` points).
        t_start: start time; defaults to `self.noise_schedule.T`.
        t_end: end time; defaults to `1. / self.noise_schedule.total_N`.
        skip_type: forwarded to `self.get_time_steps`.
        skip_order: forwarded to `self.get_time_steps` as `order`.
        predictor_order: maximum order passed to `self.adams_bashforth_update`.
        corrector_order: maximum order passed to `self.adams_moulton_update`; the corrector is
            skipped entirely when this is not positive.
        pc_mode: 'PEC' or 'PECE'.
        return_intermediate: when true, also return the list of `x` values collected after
            every step.

    Returns:
        `x`, or the tuple `(x, intermediates)` when `return_intermediate` is true.
    """
    # Fixed configuration of this variant; several branches below are therefore never taken.
    skip_first_step = False
    skip_final_step = False
    lower_order_final = True
    denoise_to_zero = True
    assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
    t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    t_T = self.noise_schedule.T if t_start is None else t_start
    assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
    device = x.device
    intermediates = []
    with torch.no_grad():
        # The warm-up loop below needs max(predictor_order, corrector_order - 1) grid points.
        assert steps >= max(predictor_order, corrector_order - 1)
        timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
                                        device=device)
        assert timesteps.shape[0] - 1 == steps
        # Init the initial values.
        step = 0
        t = timesteps[step]
        # NOTE: this draw is overwritten before it is read, but it still advances the RNG state.
        noise = torch.randn_like(x)
        # History of visited times; it grows by one entry per step and is never trimmed here.
        t_prev_list = [t]
        # do not evaluate if skip_first_step
        if skip_first_step:
            if self.predict_x0:
                alpha_t = self.noise_schedule.marginal_alpha(t)
                sigma_t = self.noise_schedule.marginal_std(t)
                model_prev_list = [(1 - sigma_t) / alpha_t * x]
            else:
                model_prev_list = [x]
        else:
            model_prev_list = [self.model_fn(x, t)]
        if self.correcting_xt_fn is not None:
            x = self.correcting_xt_fn(x, t, step)
        if return_intermediate:
            intermediates.append(x)
        # determine the first several values
        # Warm-up: the usable order is capped by the number of model outputs collected so far.
        for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):
            t = timesteps[step]
            predictor_order_used = min(predictor_order, step)
            corrector_order_used = min(corrector_order, step + 1)
            # The same noise sample is shared by the predictor and the corrector of this step.
            noise = torch.randn_like(x)
            # predictor step
            x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                              model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
                                              t=t)
            # evaluation step
            model_x = self.model_fn(x_p, t)
            # update model_list
            model_prev_list.append(model_x)
            # corrector step
            if corrector_order > 0:
                x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                              model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
                                              t=t)
            else:
                x = x_p
            # evaluation step if mode = pece
            # PECE replaces the model output taken at the prediction with one taken at the corrected x.
            if corrector_order > 0 and pc_mode == 'PECE':
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            t_prev_list.append(t)
        # Main loop over the remaining grid points, including the last one (index `steps`).
        for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
            if lower_order_final:
                # Cap the orders by the number of remaining steps.
                predictor_order_used = min(predictor_order, steps - step + 1)
                corrector_order_used = min(corrector_order, steps - step + 2)
            else:
                predictor_order_used = predictor_order
                corrector_order_used = corrector_order
            t = timesteps[step]
            noise = torch.randn_like(x)
            # predictor step
            if skip_final_step and step == steps and not denoise_to_zero:
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=0,
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
            else:
                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
            # evaluation step
            # do not evaluate if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_x = self.model_fn(x_p, t)
            # update model_list
            # do not update if skip_final_step and step = steps
            if not skip_final_step or step < steps:
                model_prev_list.append(model_x)
            # corrector step
            # do not correct if skip_final_step and step = steps
            if corrector_order > 0:
                if not skip_final_step or step < steps:
                    x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list,
                                                  noise=noise, t=t)
                else:
                    x = x_p
            else:
                x = x_p
            # evaluation step if mode = pece and step != steps
            if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
                model_x = self.model_fn(x, t)
                del model_prev_list[-1]
                model_prev_list.append(model_x)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step)
            if return_intermediate:
                intermediates.append(x)
            t_prev_list.append(t)
            # Drop the oldest model output so the history keeps a fixed length in this loop.
            del model_prev_list[0]
        if denoise_to_zero:
            # One extra denoising call at t_0; `step` still holds the last loop index here.
            t = torch.ones((1,)).to(device) * t_0
            x = self.denoise_to_zero_fn(x, t)
            if self.correcting_xt_fn is not None:
                x = self.correcting_xt_fn(x, t, step + 1)
            if return_intermediate:
                intermediates.append(x)
    if return_intermediate:
        return x, intermediates
    else:
        return x
def sample(self, mode, x, tau, steps, t_start=None, t_end=None, skip_type='time', skip_order=1, predictor_order=3,
           corrector_order=4, pc_mode='PEC', return_intermediate=False
           ):
    """
    Dispatch to `sample_few_steps` or `sample_more_steps` depending on `mode`.

    For the PC-mode, please refer to the wiki page
    https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
    'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations.
    pc_mode='PEC' is recommended when NFEs are limited; 'PECE' is only meant for tests with sufficient NFEs.

    'few_steps' mode is recommended. The differences between 'few_steps' and 'more_steps' are:
    1) 'few_steps' does not correct at the final step and does not denoise to zero, while 'more_steps' does both.
       Thus the NFEs for 'few_steps' = steps, NFEs for 'more_steps' = steps + 2.
       For most experiments and tasks these two operations were found not to help sample quality much.
    2) 'few_steps' uses a rescaling trick as in Appendix D of the SA-Solver paper
       https://arxiv.org/pdf/2309.05019.pdf, which slightly improves sample quality, especially with few steps.

    All remaining arguments are forwarded unchanged, by keyword, to the selected method.
    """
    assert mode in ['few_steps', 'more_steps'], "mode must be either 'few_steps' or 'more_steps'"
    # Both back-ends accept exactly the same keyword arguments.
    forwarded = dict(
        x=x, tau=tau, steps=steps, t_start=t_start, t_end=t_end, skip_type=skip_type,
        skip_order=skip_order, predictor_order=predictor_order,
        corrector_order=corrector_order, pc_mode=pc_mode,
        return_intermediate=return_intermediate,
    )
    backend = self.sample_few_steps if mode == 'few_steps' else self.sample_more_steps
    return backend(**forwarded)
#############################################################
# other utility functions
#############################################################
def interpolate_fn(x, xp, yp):
    """
    Evaluate a piecewise-linear function y = f(x) defined by the keypoints (xp, yp).

    Only differentiable tensor ops are used, so the result works with autograd.
    Outside the range covered by `xp` the outermost segment is extended linearly.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size and C the number of channels.
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch = x.shape[0]
    num_keys = xp.shape[1]
    dev = x.device

    # Put each query in front of its channel's keypoints and sort them together.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, order = torch.sort(merged, dim=2)
    # The query had index 0 before sorting, so the argmin of the permutation is its sorted rank.
    rank = torch.argmin(order, dim=2)
    left = rank - 1
    below_all = torch.eq(rank, 0)
    above_all = torch.eq(rank, num_keys)
    # Shared by both index computations: clamp to the last segment when x is beyond every keypoint.
    clamped_left = torch.where(above_all, torch.tensor(num_keys - 2, device=dev), left)

    # Positions of the segment end points inside the merged, sorted array (which still contains x).
    lo_pos = torch.where(below_all, torch.tensor(1, device=dev), clamped_left)
    hi_pos = torch.where(torch.eq(lo_pos, left), lo_pos + 2, lo_pos + 1)
    start_x = torch.gather(merged_sorted, dim=2, index=lo_pos.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(merged_sorted, dim=2, index=hi_pos.unsqueeze(2)).squeeze(2)

    # Positions of the same end points inside yp (which does not contain x).
    y_lo_pos = torch.where(below_all, torch.tensor(0, device=dev), clamped_left)
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    start_y = torch.gather(yp_batched, dim=2, index=y_lo_pos.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(yp_batched, dim=2, index=(y_lo_pos + 1).unsqueeze(2)).squeeze(2)

    return start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` until it has `dims` dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`, the total number of dimensions of the result.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] whose total dimension is `dims`.
    """
    trailing_axes = (None,) * (dims - 1)
    index = (...,) + trailing_axes
    return v[index]
\ No newline at end of file
# -*- coding: utf-8 -*-
import os
import re
import html
import urllib.parse as ul
import ftfy
import torch
from bs4 import BeautifulSoup
from transformers import T5EncoderModel, AutoTokenizer
from huggingface_hub import hf_hub_download
class T5Embedder:
    """
    Tokenizer + T5 encoder wrapper that turns a list of captions into encoder hidden states.

    The tokenizer is loaded with `AutoTokenizer.from_pretrained` and the encoder with
    `T5EncoderModel.from_pretrained`; when files are not taken from a local cache they are
    fetched from the `DeepFloyd/t5-v1_1-xxl` hub repository with `hf_hub_download`.
    """

    available_models = ['t5-v1_1-xxl']
    # One raw string instead of concatenated non-raw pieces: sequences such as '\)' are invalid
    # string escapes (a SyntaxWarning on recent Python versions). The resulting pattern text is
    # identical to the one the concatenation produced.
    bad_punct_regex = re.compile(r'[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}')  # noqa

    def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None,
                 use_text_preprocessing=True, t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None,
                 model_max_length=120):
        """
        Load the tokenizer and the encoder.

        Args:
            device: anything accepted by `torch.device`.
            dir_or_name: a name from `available_models`, or a path/identifier handed to
                `from_pretrained` for the model weights.
            local_cache: when true, load everything from `<cache_dir>/<dir_or_name>` without downloading.
            cache_dir: root cache directory; defaults to `~/.cache/IF_`.
            hf_token: token forwarded to `hf_hub_download`.
            use_text_preprocessing: when true, captions go through `clean_caption` before tokenization.
            t5_model_kwargs: keyword arguments for `T5EncoderModel.from_pretrained`; when None a
                default set (including a device map) is built here.
            torch_dtype: dtype used in the default kwargs; defaults to `torch.bfloat16`.
            use_offload_folder: when set (and `t5_model_kwargs` is None), encoder blocks 12-23 and
                the final layers are mapped to 'disk' with this folder as offload folder.
            model_max_length: tokenizer `max_length` used for padding/truncation.
        """
        self.device = torch.device(device)
        self.torch_dtype = torch_dtype or torch.bfloat16
        if t5_model_kwargs is None:
            t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
            if use_offload_folder is not None:
                t5_model_kwargs['offload_folder'] = use_offload_folder
                # First 12 encoder blocks stay on the device, the remaining 12 are offloaded.
                device_map = {'shared': self.device, 'encoder.embed_tokens': self.device}
                for block_idx in range(24):
                    device_map[f'encoder.block.{block_idx}'] = self.device if block_idx < 12 else 'disk'
                device_map['encoder.final_layer_norm'] = 'disk'
                device_map['encoder.dropout'] = 'disk'
                t5_model_kwargs['device_map'] = device_map
            else:
                t5_model_kwargs['device_map'] = {'shared': self.device, 'encoder': self.device}

        self.use_text_preprocessing = use_text_preprocessing
        self.hf_token = hf_token
        self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
        self.dir_or_name = dir_or_name
        tokenizer_path, path = dir_or_name, dir_or_name
        # NOTE(review): `force_filename` is passed to hf_hub_download below; confirm the installed
        # huggingface_hub version still accepts that argument.
        if local_cache:
            cache_dir = os.path.join(self.cache_dir, dir_or_name)
            tokenizer_path, path = cache_dir, cache_dir
        elif dir_or_name in self.available_models:
            cache_dir = os.path.join(self.cache_dir, dir_or_name)
            for filename in [
                'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
                'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin'
            ]:
                hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=cache_dir,
                                force_filename=filename, token=self.hf_token)
            tokenizer_path, path = cache_dir, cache_dir
        else:
            # Custom weights: only the tokenizer files are fetched; the model is loaded from `dir_or_name`.
            cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
            for filename in [
                'config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json',
            ]:
                hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=cache_dir,
                                force_filename=filename, token=self.hf_token)
            tokenizer_path = cache_dir

        print(tokenizer_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
        self.model_max_length = model_max_length

    def get_text_embeddings(self, texts):
        """
        Encode `texts` and return `(last_hidden_state, attention_mask)`.

        Each text is preprocessed, then tokenized with padding/truncation to `model_max_length`.
        Both returned tensors live on `self.device`; the encoder runs under `torch.no_grad()`.
        """
        texts = [self.text_preprocessing(text) for text in texts]
        text_tokens_and_mask = self.tokenizer(
            texts,
            max_length=self.model_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        input_ids = text_tokens_and_mask['input_ids'].to(self.device)
        attention_mask = text_tokens_and_mask['attention_mask'].to(self.device)

        with torch.no_grad():
            text_encoder_embs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )['last_hidden_state'].detach()
        return text_encoder_embs, attention_mask

    def text_preprocessing(self, text):
        """Clean `text` with `clean_caption` (applied twice) or, if disabled, just lower-case and strip it."""
        if self.use_text_preprocessing:
            # The exact text cleaning as was in the training stage:
            text = self.clean_caption(text)
            text = self.clean_caption(text)
            return text
        else:
            return text.lower().strip()

    @staticmethod
    def basic_clean(text):
        """Fix text with ftfy, unescape HTML entities twice and strip surrounding whitespace."""
        text = ftfy.fix_text(text)
        text = html.unescape(html.unescape(text))
        return text.strip()

    def clean_caption(self, caption):
        """
        Normalize a caption: lower-case it and remove URLs, HTML, handles, CJK characters,
        ids, file names, marketing boilerplate and stray punctuation.

        The substitutions are order-dependent; returns the cleaned, stripped string.
        """
        caption = str(caption)
        caption = ul.unquote_plus(caption)
        caption = caption.strip().lower()
        caption = re.sub('<person>', 'person', caption)
        # urls:
        caption = re.sub(
            r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
            '', caption)  # regex for urls
        caption = re.sub(
            r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
            '', caption)  # regex for urls
        # html:
        caption = BeautifulSoup(caption, features='html.parser').text

        # @<nickname>
        caption = re.sub(r'@[\w\d]+\b', '', caption)

        # 31C0—31EF CJK Strokes
        # 31F0—31FF Katakana Phonetic Extensions
        # 3200—32FF Enclosed CJK Letters and Months
        # 3300—33FF CJK Compatibility
        # 3400—4DBF CJK Unified Ideographs Extension A
        # 4DC0—4DFF Yijing Hexagram Symbols
        # 4E00—9FFF CJK Unified Ideographs
        caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
        caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
        caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
        caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
        caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
        caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
        caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
        #######################################################

        # all types of dash --> "-"
        caption = re.sub(
            r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+',  # noqa
            '-', caption)

        # normalize quotation marks to one standard
        caption = re.sub(r'[`´«»“”¨]', '"', caption)
        caption = re.sub(r'[‘’]', "'", caption)

        # &quot;
        caption = re.sub(r'&quot;?', '', caption)
        # &amp
        caption = re.sub(r'&amp', '', caption)

        # ip adresses:
        caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)

        # article ids:
        caption = re.sub(r'\d:\d\d\s+$', '', caption)

        # \n
        caption = re.sub(r'\\n', ' ', caption)

        # "#123"
        caption = re.sub(r'#\d{1,3}\b', '', caption)
        # "#12345.."
        caption = re.sub(r'#\d{5,}\b', '', caption)
        # "123456.."
        caption = re.sub(r'\b\d{6,}\b', '', caption)
        # filenames:
        caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)

        #
        caption = re.sub(r'[\"\']{2,}', r'"', caption)  # """AUSVERKAUFT"""
        caption = re.sub(r'[\.]{2,}', r' ', caption)  # """AUSVERKAUFT"""

        caption = re.sub(self.bad_punct_regex, r' ', caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
        caption = re.sub(r'\s+\.\s+', r' ', caption)  # " . "

        # this-is-my-cute-cat / this_is_my_cute_cat
        regex2 = re.compile(r'(?:\-|\_)')
        if len(re.findall(regex2, caption)) > 3:
            caption = re.sub(regex2, ' ', caption)

        caption = self.basic_clean(caption)

        caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption)  # jc6640
        caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption)  # jc6640vc
        caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption)  # 6640vc231

        caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
        caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
        caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
        caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
        caption = re.sub(r'\bpage\s+\d+\b', '', caption)

        caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption)  # j2d1a2a...

        caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)

        caption = re.sub(r'\b\s+\:\s+', r': ', caption)
        caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
        caption = re.sub(r'\s+', ' ', caption)

        # NOTE: a bare `caption.strip()` used to sit here with its result discarded, i.e. a no-op.
        # The caption is deliberately NOT stripped at this point: doing so would change which of the
        # anchored patterns below match, and text_preprocessing states that this cleaning must stay
        # identical to the one used at training time.

        caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
        caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
        caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
        caption = re.sub(r'^\.\S+$', '', caption)

        return caption.strip()
# Modified from OpenAI's diffusion repos
# GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
from abc import ABC, abstractmethod
import numpy as np
import torch as th
import torch.distributed as dist
def create_named_schedule_sampler(name, diffusion):
    """
    Build one of the pre-defined schedule samplers by name.

    :param name: the name of the sampler ("uniform" or "loss-second-moment").
    :param diffusion: the diffusion object to sample for.
    :raises NotImplementedError: if `name` is not a known sampler.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")
class ScheduleSampler(ABC):
    """
    A distribution over diffusion timesteps, meant to reduce the variance of
    the training objective.

    The default `sample()` performs unbiased importance sampling, so the mean
    of the objective is unchanged. Subclasses may override `sample()` to
    reweight the resampled terms differently and thereby change the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Return a numpy array with one weight per diffusion step.

        The weights do not have to be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Draw importance-sampled timesteps for a batch.

        :param batch_size: the number of timesteps to draw.
        :param device: the torch device the results are moved to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        num_steps = len(probs)
        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
        # Inverse-probability weights, scaled so that a uniform distribution gives all ones.
        importance = 1 / (num_steps * probs[drawn])
        indices = th.from_numpy(drawn).long().to(device)
        weights = th.from_numpy(importance).float().to(device)
        return indices, weights
class UniformSampler(ScheduleSampler):
    """Schedule sampler that gives every diffusion timestep the same weight."""

    def __init__(self, diffusion):
        self.diffusion = diffusion
        num_steps = diffusion.num_timesteps
        # One constant weight per timestep, created once and reused by weights().
        self._weights = np.ones([num_steps])

    def weights(self):
        """Return the constant all-ones weight array."""
        return self._weights
class LossAwareSampler(ScheduleSampler):
    """Schedule sampler whose weights are updated from the losses observed during training."""

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.

        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.

        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        world_size = dist.get_world_size()
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=local_ts.device)
            for _ in range(world_size)
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)

        def _gather_padded(local):
            # all_gather needs equally sized tensors of one dtype on every rank, so the local
            # batch is zero-padded to the largest batch size and the receive buffers are created
            # with the same dtype/device as the data (integer timesteps must stay integers,
            # since they are later used as array indices).
            local = local.detach()
            padded = local.new_zeros(max_bs)
            padded[:len(local)] = local
            received = [local.new_zeros(max_bs) for _ in batch_sizes]
            dist.all_gather(received, padded)
            # Drop the padding again, keeping only each rank's real entries.
            return [value for chunk, bs in zip(received, batch_sizes) for value in chunk[:bs].tolist()]

        timesteps = _gather_padded(local_ts)
        losses = _gather_padded(local_losses)
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.

        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
class LossSecondMomentResampler(LossAwareSampler):
    """
    Loss-aware sampler that weights each timestep by the root mean square of
    its most recent losses, mixed with a small uniform probability.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        """
        :param diffusion: object exposing `num_timesteps`.
        :param history_per_term: number of recent losses remembered per timestep.
        :param uniform_prob: share of the probability mass spread uniformly over all timesteps.
        """
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was only an alias of the builtin `int` and no longer exists in current
        # NumPy releases; the builtin yields the same dtype.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """Return uniform weights until every timestep has a full history, RMS-based weights afterwards."""
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """Record one loss per given timestep, keeping only the latest `history_per_term` values."""
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """True once every timestep has collected `history_per_term` losses."""
        return (self._loss_counts == self.history_per_term).all()
import os
import sys
import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential
import torch.nn.functional as F
import torch
import torch.distributed as dist
import re
import math
from collections.abc import Iterable
from itertools import repeat
from torchvision import transforms as T
import random
from PIL import Image
def _ntuple(n):
    """Return a converter that repeats a scalar (or string) `n` times and passes other iterables through."""
    def parse(x):
        # Strings are iterable but are treated as scalars here.
        if isinstance(x, str) or not isinstance(x, Iterable):
            return tuple(repeat(x, n))
        return x
    return parse


# Ready-made converters for 1- and 2-element tuples.
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    """
    Flag `model` and all of its sub-modules for gradient checkpointing.

    Every module gets `grad_checkpointing = True`, `fp32_attention = use_fp32_attention`
    and `grad_checkpointing_step = gc_step` set as plain attributes.
    """
    assert isinstance(model, nn.Module)
    # modules() yields the model itself and every nested sub-module.
    for sub_module in model.modules():
        sub_module.grad_checkpointing = True
        sub_module.fp32_attention = use_fp32_attention
        sub_module.grad_checkpointing_step = gc_step
def auto_grad_checkpoint(module, *args, **kwargs):
    """
    Call `module(*args, **kwargs)`, going through gradient checkpointing when the module is flagged for it.

    A module without a truthy `grad_checkpointing` attribute is called directly. A flagged
    iterable module is run through `checkpoint_sequential` with the step stored on its first
    element; any other flagged module goes through `checkpoint`.
    """
    if not getattr(module, 'grad_checkpointing', False):
        return module(*args, **kwargs)
    if isinstance(module, Iterable):
        gc_step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, gc_step, *args, **kwargs)
    return checkpoint(module, *args, **kwargs)
def checkpoint_sequential(functions, step, input, *args, **kwargs):
    """
    Run `functions` one after another on `input`, checkpointing them in chunks of `step` layers.

    Every layer is called as `layer(hidden, *args)`. All chunks except the trailing one go
    through `checkpoint`; the remaining layers are executed directly. The only accepted
    keyword argument is `preserve_rng_state` (default True).

    Raises:
        ValueError: if any other keyword argument is given.
    """
    # Hack for keyword-only parameter in a python 2.7-compliant way
    preserve = kwargs.pop('preserve_rng_state', True)
    if kwargs:
        raise ValueError("Unexpected keyword arguments: " + ",".join(kwargs))

    if isinstance(functions, torch.nn.Sequential):
        functions = list(functions.children())

    def make_segment(first, last):
        # Build a callable that applies layers first..last (inclusive) in order.
        def run(hidden):
            for layer_idx in range(first, last + 1):
                hidden = functions[layer_idx](hidden, *args)
            return hidden
        return run

    # the last chunk has to be non-volatile
    last_checkpointed = -1
    num_segments = len(functions) // step
    for first in range(0, step * (num_segments - 1), step):
        last_checkpointed = first + step - 1
        input = checkpoint(make_segment(first, last_checkpointed), input, preserve_rng_state=preserve)
    return make_segment(last_checkpointed + 1, len(functions) - 1)(input)
def window_partition(x, window_size):
    """
    Split a feature map into non-overlapping square windows, zero-padding bottom/right if needed.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape
    # Amount needed to reach the next multiple of window_size (0 when already aligned).
    pad_h = (-H) % window_size
    pad_w = (-W) % window_size
    if pad_h or pad_w:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp = H + pad_h
    Wp = W + pad_w

    rows, cols = Hp // window_size, Wp // window_size
    blocks = x.view(B, rows, window_size, cols, window_size, C)
    # Bring the two window-grid axes next to each other before flattening them into the batch axis.
    windows = blocks.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)
def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Reassemble windows into the original feature map and cut away any padding.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    windows_per_image = Hp * Wp // window_size // window_size
    B = windows.shape[0] // windows_per_image

    grid = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    # Interleave window rows/columns back into full image rows/columns.
    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Look up relative positional embeddings for every (query, key) position pair.

    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).
    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    if rel_pos.shape[0] == max_rel_dist:
        table = rel_pos
    else:
        # Linearly resample the embedding table to the required number of relative distances.
        as_batch = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
        table = F.interpolate(
            as_batch,
            size=max_rel_dist,
            mode="linear",
        )
        table = table.reshape(-1, max_rel_dist).permute(1, 0)

    # Scale the coords with short length if shapes for q and k are different.
    q_scale = max(k_size / q_size, 1.0)
    k_scale = max(q_size / k_size, 1.0)
    q_coords = torch.arange(q_size)[:, None] * q_scale
    k_coords = torch.arange(k_size)[None, :] * k_scale
    # Shift so that the smallest relative offset maps to index 0.
    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_scale
    return table[relative_coords.long()]
def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Add decomposed relative positional embeddings (as in :paper:`mvitv2`) to an attention map.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950

    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    (q_h, q_w), (k_h, k_w) = q_size, k_size
    height_table = get_rel_pos(q_h, k_h, rel_pos_h)
    width_table = get_rel_pos(q_w, k_w, rel_pos_w)

    batch, _, channels = q.shape
    q_grid = q.reshape(batch, q_h, q_w, channels)
    # Separate bias terms along the height and the width axis.
    bias_h = torch.einsum("bhwc,hkc->bhwk", q_grid, height_table)
    bias_w = torch.einsum("bhwc,wkc->bhwk", q_grid, width_table)

    attn_5d = attn.view(batch, q_h, q_w, k_h, k_w)
    attn_5d = attn_5d + bias_h[:, :, :, :, None] + bias_w[:, :, :, None, :]
    return attn_5d.view(batch, q_h * q_w, k_h * k_w)
def mean_flat(tensor):
    """Average `tensor` over every dimension except the first one."""
    non_batch_dims = list(range(1, tensor.ndim))
    return tensor.mean(dim=non_batch_dims)
#################################################################################
# Token Masking and Unmasking #
#################################################################################
def get_mask(batch, length, mask_ratio, device, mask_type=None, data_info=None, extra_len=0):
    """
    Get the binary mask for the input sequence.

    Args:
        - batch: batch size
        - length: sequence length
        - mask_ratio: ratio of tokens to mask
        - device: device on which the mask (and the random noise) is created
        - mask_type: one of 'random', 'group', 'fft', 'laplacian'
        - data_info: dictionary with info for reconstruction; for 'fft'/'laplacian' it must hold
          either a precomputed 'strength' or both 'N' (patch size, read as data_info['N'][0])
          and 'ori_img' (the image batch)
        - extra_len: number of additional tokens subtracted from the kept length
    return:
        mask_dict with following keys:
        - mask: binary mask, 0 is keep, 1 is remove
        - ids_keep: indices of tokens to keep
        - ids_restore: indices to restore the original order
        - ids_removed: indices of tokens that are removed
    """
    assert mask_type in ['random', 'fft', 'laplacian', 'group']
    mask = torch.ones([batch, length], device=device)
    len_keep = int(length * (1 - mask_ratio)) - extra_len
    if mask_type in ['random', 'group']:
        noise = torch.rand(batch, length, device=device)  # noise in [0, 1]
        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        ids_removed = ids_shuffle[:, len_keep:]
    elif mask_type in ['fft', 'laplacian']:
        if 'strength' in data_info:
            strength = data_info['strength']
        else:
            N = data_info['N'][0]
            img = data_info['ori_img']
            # Size of the original image batch.
            _, C, H, W = img.shape
            if mask_type == 'fft':
                # Reshape the image into patches: (batch, C, H/N, N, W/N, N).
                reshaped_image = img.reshape((batch, -1, H // N, N, W // N, N))
                fft_image = torch.fft.fftn(reshaped_image, dim=(3, 5))
                # Sum of absolute FFT coefficients = frequency strength of each patch.
                strength = torch.sum(torch.abs(fft_image), dim=(1, 3, 5)).reshape((batch, -1,))
            elif mask_type == 'laplacian':
                # (This branch used to test the builtin `type`, so it never ran and `strength`
                # stayed undefined.) The kernel follows the image's dtype/device so that
                # conv2d accepts it wherever the image lives.
                laplacian_kernel = torch.tensor(
                    [[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=img.dtype, device=img.device
                ).reshape(1, 1, 3, 3)
                laplacian_kernel = laplacian_kernel.repeat(C, 1, 1, 1)
                # Reshape the image into a batch of patches: (batch * num_patches, C, N, N).
                reshaped_image = img.reshape(-1, C, H // N, N, W // N, N).permute(0, 2, 4, 1, 3, 5).reshape(-1, C, N, N)
                laplacian_response = F.conv2d(reshaped_image, laplacian_kernel, padding=1, groups=C)
                strength = laplacian_response.sum(dim=[1, 2, 3]).reshape((batch, -1,))
        # Normalize the strengths per sample, then draw a permutation with torch.multinomial
        # (sampling without replacement, so stronger patches tend to come first and be kept).
        probabilities = strength / (strength.max(dim=1)[0][:, None] + 1e-5)
        ids_shuffle = torch.multinomial(probabilities.clip(1e-5, 1), length, replacement=False)
        ids_keep = ids_shuffle[:, :len_keep]
        ids_restore = torch.argsort(ids_shuffle, dim=1)
        ids_removed = ids_shuffle[:, len_keep:]
    # In shuffled order the first len_keep tokens are kept; un-shuffle to the original order.
    mask[:, :len_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)
    return {'mask': mask,
            'ids_keep': ids_keep,
            'ids_restore': ids_restore,
            'ids_removed': ids_removed}
def mask_out_token(x, ids_keep, ids_removed=None):
    """
    Select the tokens listed in ids_keep (and, optionally, those in ids_removed).

    Args:
        - x: input sequence, [N, L, D]
        - ids_keep: indices of tokens to keep
        - ids_removed: optional indices of the removed tokens
    return:
        - the kept tokens, or the tuple (kept tokens, removed tokens) when ids_removed is given
    """
    _, _, D = x.shape  # batch, length, dim

    def pick(token_ids):
        # Broadcast each token index over the feature dimension for gather.
        return torch.gather(x, dim=1, index=token_ids.unsqueeze(-1).expand(-1, -1, D))

    x_remain = pick(ids_keep)
    if ids_removed is None:
        return x_remain
    return x_remain, pick(ids_removed)
def mask_tokens(x, mask_ratio):
    """
    Randomly drop a fraction of tokens independently for every sample.

    The per-sample shuffle comes from arg-sorting uniform random noise.
    x: [N, L, D], sequence
    Returns (x_masked, mask, ids_restore), where mask is 0 for kept and 1 for removed tokens.
    """
    N, L, D = x.shape  # batch, length, dim
    num_keep = int(L * (1 - mask_ratio))

    scores = torch.rand(N, L, device=x.device)  # noise in [0, 1]
    # ascending sort per sample: tokens with small noise are kept, large are removed
    shuffle_order = torch.argsort(scores, dim=1)
    ids_restore = torch.argsort(shuffle_order, dim=1)

    # the first num_keep entries of the shuffle are the survivors
    kept_ids = shuffle_order[:, :num_keep]
    gather_index = kept_ids.unsqueeze(-1).repeat(1, 1, D)
    x_masked = torch.gather(x, dim=1, index=gather_index)

    # binary mask in shuffled order (0 keep, 1 remove), then restored to the original order
    mask = torch.ones([N, L], device=x.device)
    mask[:, :num_keep] = 0
    mask = torch.gather(mask, dim=1, index=ids_restore)
    return x_masked, mask, ids_restore
def unmask_tokens(x, ids_restore, mask_token):
    """
    Append copies of `mask_token` for the missing positions and undo the shuffle.

    x: [N, T, D] if extras == 0 (i.e., no cls token) else x: [N, T+1, D]
    """
    num_missing = ids_restore.shape[1] - x.shape[1]
    filler = mask_token.repeat(x.shape[0], num_missing, 1)
    padded = torch.cat([x, filler], dim=1)
    restore_index = ids_restore.unsqueeze(-1).repeat(1, 1, padded.shape[2])
    return torch.gather(padded, dim=1, index=restore_index)  # unshuffle
# Parse 'None' to None and others to float value
def parse_float_none(s):
    """Map the literal string 'None' to None and any other string to a float."""
    assert isinstance(s, str)
    if s == 'None':
        return None
    return float(s)
#----------------------------------------------------------------------------
# Parse a comma separated list of numbers or ranges and return a list of ints.
# Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10]
def parse_int_list(s):
    """
    Parse a comma separated list of numbers or inclusive ranges into ints.

    Whitespace around items and around the range dash is ignored, so
    '1, 2, 5 - 10' parses the same as '1,2,5-10'.

    Args:
        s: the string to parse, or a list, which is returned unchanged.
    Returns:
        list of ints, in the order given; 'a-b' expands to a, a+1, ..., b.
    Raises:
        ValueError: if an item is neither an integer nor a 'start-end' range.
    """
    if isinstance(s, list):
        return s
    range_re = re.compile(r'(\d+)\s*-\s*(\d+)')
    values = []
    for item in s.split(','):
        item = item.strip()
        if m := range_re.fullmatch(item):
            values.extend(range(int(m.group(1)), int(m.group(2)) + 1))
        else:
            values.append(int(item))
    return values
def init_processes(fn, args):
    """
    Initialize the distributed environment, run ``fn(args)``, then tear down.

    Reads ``args.master_address``, ``args.local_rank``, ``args.global_rank``
    and ``args.global_size``. ``cleanup()`` is only called when more than one
    process takes part.
    """
    env = os.environ
    env['MASTER_ADDR'] = args.master_address
    # NOTE(review): the port is drawn from this process's own RNG; confirm that
    # every rank ends up with the same value before relying on 'env://'.
    env['MASTER_PORT'] = str(random.randint(2000, 6000))
    print(f'MASTER_ADDR = {os.environ["MASTER_ADDR"]}')
    print(f'MASTER_PORT = {os.environ["MASTER_PORT"]}')

    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        rank=args.global_rank,
        world_size=args.global_size,
    )

    fn(args)

    if args.global_size <= 1:
        return
    cleanup()
def mprint(*args, **kwargs):
    """
    Forward the arguments to ``print`` on rank 0; do nothing on other ranks.
    """
    if dist.get_rank() != 0:
        return
    print(*args, **kwargs)
def cleanup():
    """
    End DDP training: synchronise, report from rank 0, synchronise again and
    destroy the process group.
    """
    dist.barrier()
    # Same rank-0-only print that mprint("Done!") performs.
    if dist.get_rank() == 0:
        print("Done!")
    dist.barrier()
    dist.destroy_process_group()
#----------------------------------------------------------------------------
# logging info.
class Logger(object):
    """
    Redirect stderr to stdout, optionally print stdout to a file,
    and optionally force flushing on both stdout and the file.

    Constructing an instance installs it as both ``sys.stdout`` and
    ``sys.stderr``; ``close()`` (or leaving a ``with`` block) restores the
    streams that were active at construction time.
    """

    def __init__(self, file_name=None, file_mode="w", should_flush=True):
        """
        Args:
            file_name: path of the log file, or None to only forward to stdout.
            file_mode: mode passed to ``open`` for the log file.
            should_flush: flush stdout and the file after every write.
        """
        self.file = None
        if file_name is not None:
            self.file = open(file_name, file_mode)
        self.should_flush = should_flush
        # Remember the streams being replaced so close() can put them back.
        self.stdout = sys.stdout
        self.stderr = sys.stderr
        sys.stdout = self
        sys.stderr = self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def write(self, text):
        """Write text to stdout (and a file) and optionally flush."""
        if isinstance(text, bytes):
            # Both sinks are text streams; decode instead of failing in write().
            text = text.decode()
        if len(text) == 0:  # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
            return
        if self.file is not None:
            self.file.write(text)
        self.stdout.write(text)
        if self.should_flush:
            self.flush()

    def flush(self):
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()
        self.stdout.flush()

    def close(self):
        """Flush, close possible files, and remove stdout/stderr mirroring.

        Safe to call more than once: the file handle is dropped after closing,
        so a later ``flush()``/``close()`` no longer touches a closed file.
        """
        self.flush()
        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr
        if self.file is not None:
            self.file.close()
            self.file = None
class StackedRandomGenerator:
    """
    Holds one ``torch.Generator`` per seed and draws the i-th slice of every
    batch from the i-th generator, stacking the slices along dim 0.
    """

    def __init__(self, device, seeds):
        super().__init__()
        generators = []
        for seed in seeds:
            # Seeds are folded into the 32-bit range before use.
            generator = torch.Generator(device)
            generator.manual_seed(int(seed) % (1 << 32))
            generators.append(generator)
        self.generators = generators

    def randn(self, size, **kwargs):
        """Normal samples of shape ``size``; ``size[0]`` must equal the number of seeds."""
        assert size[0] == len(self.generators)
        item_size = size[1:]
        slices = []
        for generator in self.generators:
            slices.append(torch.randn(item_size, generator=generator, **kwargs))
        return torch.stack(slices)

    def randn_like(self, input):
        """Like ``randn`` with shape, dtype, layout and device taken from ``input``."""
        return self.randn(
            input.shape,
            dtype=input.dtype,
            layout=input.layout,
            device=input.device,
        )

    def randint(self, *args, size, **kwargs):
        """Integer samples of shape ``size``; positional args are forwarded to ``torch.randint``."""
        assert size[0] == len(self.generators)
        item_size = size[1:]
        slices = []
        for generator in self.generators:
            slices.append(torch.randint(*args, size=item_size, generator=generator, **kwargs))
        return torch.stack(slices)
def prepare_prompt_ar(prompt, ratios, device='cpu', show=True):
    """
    Split size flags off a prompt and resolve them against a ratio table.

    Recognised flags are ``--aspect_ratio a:b``, ``--ar a:b`` and ``--hw h:w``.
    ``--aspect_ratio`` wins over ``--ar``; without either the ratio is 1.

    Args:
        prompt: raw prompt text, optionally followed by size flags.
        ratios: mapping whose keys convert with ``float()`` and whose values
            are the height/width pair for that ratio.
        device: device for the returned tensors.
        show: print the parsed flags and the format warning.
    Returns:
        (prompt_clean, prompt_show, default_hw, closest_ratio, custom_hw); the
        last three are tensors with a leading dim of 1. ``custom_hw`` equals
        ``default_hw`` unless ``--hw`` is given.
    """
    def _as_ratio(spec):
        numerator, denominator = spec.split(':')
        return float(numerator) / float(denominator)

    # get aspect_ratio or ar
    aspect_ratios = re.findall(r"--aspect_ratio\s+(\d+:\d+)", prompt)
    ars = re.findall(r"--ar\s+(\d+:\d+)", prompt)
    hw_specs = re.findall(r"--hw\s+(\d+:\d+)", prompt)
    if show:
        print("aspect_ratios:", aspect_ratios, "ars:", ars, "hws:", hw_specs)

    # Everything from the first flag onwards is dropped from the prompt text.
    prompt_clean = prompt
    for flag in ("--aspect_ratio", "--ar", "--hw"):
        prompt_clean = prompt_clean.split(flag)[0]

    if show and not (aspect_ratios or ars or hw_specs):
        print( "Wrong prompt format. Set to default ar: 1. change your prompt into format '--ar h:w or --hw h:w' for correct generating")

    ratio_specs = aspect_ratios or ars
    ar = _as_ratio(ratio_specs[0]) if ratio_specs else 1.

    closest_ratio = min(ratios, key=lambda ratio: abs(float(ratio) - ar))
    default_hw = ratios[closest_ratio]
    if hw_specs:
        custom_hw = [float(side) for side in hw_specs[0].split(':')]
    else:
        custom_hw = default_hw

    prompt_show = f'prompt: {prompt_clean.strip()}\nSize: --ar {closest_ratio}, --bin hw {ratios[closest_ratio]}, --custom hw {custom_hw}'
    return (
        prompt_clean,
        prompt_show,
        torch.tensor(default_hw, device=device)[None],
        torch.tensor([float(closest_ratio)], device=device)[None],
        torch.tensor(custom_hw, device=device)[None],
    )
def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int):
    """
    Resize a batch so it covers the target size, then center-crop to it.

    Args:
        samples: tensor whose dims 2 and 3 are read as height and width.
        new_width: target width.
        new_height: target height.
    Returns:
        ``samples`` itself when it already has the target height and width,
        otherwise the resized and center-cropped result.
    """
    orig_hw = torch.tensor([samples.shape[2], samples.shape[3]], dtype=torch.int)
    custom_hw = torch.tensor([int(new_height), int(new_width)], dtype=torch.int)

    # Resize as soon as EITHER side differs. The previous `.all()` test skipped
    # the resize when only one side differed and returned the wrong size.
    if not (orig_hw != custom_hw).any():
        return samples

    # Scale by the larger factor so both sides reach at least the target size.
    ratio = max(custom_hw[0] / orig_hw[0], custom_hw[1] / orig_hw[1])
    resized_width = int(orig_hw[1] * ratio)
    resized_height = int(orig_hw[0] * ratio)
    transform = T.Compose([
        T.Resize((resized_height, resized_width)),
        T.CenterCrop(custom_hw.tolist()),
    ])
    return transform(samples)
def resize_and_crop_img(img: Image.Image, new_width, new_height):
    """
    Resize a PIL image so it covers the target size, then center-crop to it.

    Args:
        img: source image.
        new_width: target width in pixels.
        new_height: target height in pixels.
    Returns:
        A new image of exactly ``new_width`` x ``new_height`` pixels.
    """
    new_width, new_height = int(new_width), int(new_height)
    orig_width, orig_height = img.size

    # Scale by the larger factor so both sides reach at least the target size.
    ratio = max(new_width / orig_width, new_height / orig_height)
    # max(...) guards against float error making int() land one pixel short.
    resized_width = max(new_width, int(orig_width * ratio))
    resized_height = max(new_height, int(orig_height * ratio))
    img = img.resize((resized_width, resized_height), Image.LANCZOS)

    # Use an integer crop box whose far edge is derived from the near edge.
    # Passing float edges lets Pillow round each one on its own, which can
    # yield a crop one pixel smaller than requested (e.g. 1.5 -> 2, 10.5 -> 10).
    left = round((resized_width - new_width) / 2)
    top = round((resized_height - new_height) / 2)
    return img.crop((left, top, left + new_width, top + new_height))
def mask_feature(emb, mask):
    """
    Apply a token mask to an embedding along dim 2.

    Args:
        emb: 4-D embedding; dim 0 is the batch and dim 2 is the token axis.
        mask: [N, L] mask over the token axis.
    Returns:
        ``(feature, keep_index)``. For a batch of one, ``feature`` is ``emb``
        truncated to its first ``mask.sum()`` tokens and ``keep_index`` is that
        count (this presumes the set mask entries form a prefix - confirm with
        the caller). Otherwise ``feature`` is ``emb`` multiplied by the
        broadcast mask and ``keep_index`` is the full token length.
    """
    if emb.shape[0] != 1:
        return emb * mask[:, None, :, None], emb.shape[2]

    # .item() yields a float for floating-point masks, which is not a valid
    # slice bound; cast so int, bool and float masks all work.
    keep_index = int(mask.sum().item())
    return emb[:, :, :keep_index, :], keep_index
\ No newline at end of file
"""SAMPLING ONLY."""
import torch
import numpy as np
from diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver
from .model import gaussian_diffusion as gd
class SASolverSampler(object):
    """Sampling front-end that runs ``SASolver`` on a discrete ``NoiseScheduleVP``."""

    def __init__(self, model,
                 noise_schedule="linear",
                 diffusion_steps=1000,
                 device='cpu',
                 ):
        """
        Args:
            model: network handed to ``model_wrapper`` as a noise predictor.
            noise_schedule: schedule name for ``gd.get_named_beta_schedule``.
            diffusion_steps: number of steps of that schedule.
            device: device for the schedule buffer and the initial noise.
        """
        super().__init__()
        self.model = model
        self.device = device
        betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
        # Cumulative product is taken in the betas' dtype, then stored as float32.
        alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
        self.register_buffer('alphas_cumprod', alphas_cumprod.clone().detach().to(torch.float32))

    def register_buffer(self, name, attr):
        """Store ``attr`` as attribute ``name``, moving tensors to ``self.device``.

        Tensors used to be forced onto "cuda" regardless of the configured
        device, which fails on CPU-only hosts and leaves the schedule on a
        different device than the noise created in ``sample``.
        """
        if isinstance(attr, torch.Tensor):
            attr = attr.to(self.device)
        setattr(self, name, attr)

    @torch.no_grad()
    def sample(self, S, batch_size, shape, conditioning=None, callback=None, normals_sequence=None, img_callback=None, quantize_x0=False, eta=0., mask=None, x0=None, temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, verbose=True, x_T=None, log_every_t=100, unconditional_guidance_scale=1., unconditional_conditioning=None, model_kwargs=None, **kwargs):
        """
        Draw ``batch_size`` samples with classifier-free guidance.

        Args:
            S: number of solver steps.
            batch_size: number of samples.
            shape: ``(C, H, W)`` of one sample.
            conditioning: condition tensor, or a dict of tensors; only used
                here for a batch-size warning and passed on as the condition.
            eta: stochasticity used while ``0.2 <= t <= 0.8`` (0 elsewhere).
            x_T: optional starting noise; drawn with ``torch.randn`` if None.
            unconditional_guidance_scale: guidance scale.
            unconditional_conditioning: condition for the unconditional branch.
            model_kwargs: extra keyword arguments for the model.
            The remaining parameters (callback, normals_sequence, img_callback,
            quantize_x0, mask, x0, temperature, noise_dropout, score_corrector,
            corrector_kwargs, verbose, log_every_t, **kwargs) are not read by
            this implementation.
        Returns:
            ``(samples, None)`` with ``samples`` on ``self.device``.
        """
        if model_kwargs is None:
            model_kwargs = {}

        if conditioning is not None:
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
            else:
                cbs = conditioning.shape[0]
            if cbs != batch_size:
                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        device = self.device
        img = torch.randn(size, device=device) if x_T is None else x_T

        ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)
        model_fn = model_wrapper(
            self.model,
            ns,
            model_type="noise",
            guidance_type="classifier-free",
            condition=conditioning,
            unconditional_condition=unconditional_conditioning,
            guidance_scale=unconditional_guidance_scale,
            model_kwargs=model_kwargs,
        )
        sasolver = SASolver(model_fn, ns, algorithm_type="data_prediction")

        def tau_t(t):
            # Stochastic only in the middle of the time range.
            return eta if 0.2 <= t <= 0.8 else 0

        x = sasolver.sample(mode='few_steps', x=img, tau=tau_t, steps=S, skip_type='time', skip_order=1, predictor_order=2, corrector_order=2, pc_mode='PEC', return_intermediate=False)
        return x.to(device), None
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment