pixart-alpha

e2364931 · mashun1 · e2364931 · e2364931 · e2364931 · e2364931
Commit e2364931 authored Apr 27, 2024 by mashun1
20 changed files
--- a/diffusion/model/hed.py
+++ b/diffusion/model/hed.py
+# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
+# Please use this implementation in your products
+# This implementation may produce slightly different results from Saining Xie's official implementations,
+# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
+# Different from official models and other implementations, this is an RGB-input model (rather than BGR)
+# and in this way it works better for gradio's RGB protocol
+import sys
+from pathlib import Path
+current_file_path = Path(__file__).resolve()
+sys.path.insert(0, str(current_file_path.parent.parent.parent))
+from torch import nn
+import torch
+import numpy as np
+from torchvision import transforms as T
+from tqdm import tqdm
+from torch.utils.data import Dataset, DataLoader
+import json
+from PIL import Image
+import torchvision.transforms.functional as TF
+from accelerate import Accelerator
+from diffusers.models import AutoencoderKL
+import os
+
+# Side length in pixels used below for both T.Resize and T.CenterCrop, and embedded in the
+# output directory name (hed_feature_{image_resize}).
+image_resize = 1024
+
+
+class DoubleConvBlock(nn.Module):
+    """Stack of 3x3 convolution + ReLU layers with a 1x1 single-channel projection head."""
+
+    def __init__(self, input_channel, output_channel, layer_number):
+        """Build the convolution stack and the projection layer.
+
+        Args:
+            input_channel: channel count expected by the first convolution.
+            output_channel: channel count produced by every convolution in the stack.
+            layer_number: total number of 3x3 convolutions; the first one is always created.
+        """
+        super().__init__()
+        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=1)
+        # The first layer adapts the channel count; every later layer keeps output_channel.
+        stage_inputs = [input_channel] + [output_channel] * len(range(1, layer_number))
+        self.convs = torch.nn.Sequential()
+        for in_channels in stage_inputs:
+            self.convs.append(torch.nn.Conv2d(in_channels=in_channels, out_channels=output_channel, **conv_kwargs))
+        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
+
+    def forward(self, x, down_sampling=False):
+        """Return ``(features, projection)`` for ``x``.
+
+        When ``down_sampling`` is true the input is first 2x2 max-pooled with stride 2.
+        """
+        features = torch.nn.functional.max_pool2d(x, kernel_size=(2, 2), stride=(2, 2)) if down_sampling else x
+        for layer in self.convs:
+            features = torch.nn.functional.relu(layer(features))
+        return features, self.projection(features)
+
+
+class ControlNetHED_Apache2(nn.Module):
+    """Five-stage convolutional network returning one single-channel projection per stage."""
+
+    def __init__(self):
+        super().__init__()
+        # Learnable per-channel offset that forward() subtracts from the input.
+        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
+        # (input_channel, output_channel, layer_number) for block1 .. block5, in order.
+        stage_specs = ((3, 64, 2), (64, 128, 2), (128, 256, 3), (256, 512, 3), (512, 512, 3))
+        for index, (in_ch, out_ch, depth) in enumerate(stage_specs, start=1):
+            setattr(self, f"block{index}", DoubleConvBlock(input_channel=in_ch, output_channel=out_ch, layer_number=depth))
+
+    def forward(self, x):
+        """Return the tuple of five projection maps, one from each block.
+
+        Every block after the first halves the spatial size before its convolutions.
+        """
+        h, first_projection = self.block1(x - self.norm)
+        projections = [first_projection]
+        for block in (self.block2, self.block3, self.block4, self.block5):
+            h, projection = block(h, down_sampling=True)
+            projections.append(projection)
+        return tuple(projections)
+
+
+class InternData(Dataset):
+    """Dataset yielding ``(image_tensor, relative_path)`` pairs listed in ``data_info.json``."""
+
+    def __init__(self):
+        # Entries are indexed by position and read through their 'path' key in getdata().
+        with open('data/InternData/partition/data_info.json', 'r') as f:
+            self.j = json.load(f)
+        self.transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB')),
+            T.Resize(image_resize),  # Image.BICUBIC
+            T.CenterCrop(image_resize),
+            T.ToTensor(),
+        ])
+
+    def __len__(self):
+        """Return the number of entries in the metadata file."""
+        return len(self.j)
+
+    def getdata(self, idx):
+        """Load and transform the image at index ``idx``.
+
+        Returns:
+            ``(tensor, path)`` where ``path`` is the entry's 'path' value, relative to
+            ``data/InternImgs/``.
+        """
+        path = self.j[idx]['path']
+        # The context manager closes the file handle once the tensor is built, and also
+        # when the transform raises, instead of leaving it to garbage collection.
+        with Image.open("data/InternImgs/" + path) as image:
+            tensor = self.transform(image)
+        return tensor, path
+
+    def __getitem__(self, idx):
+        """Return ``getdata(idx)``, falling back to random indices on failure.
+
+        Up to 20 attempts are made; each failure is printed and a new random index is drawn.
+
+        Raises:
+            RuntimeError: if all 20 attempts fail.
+        """
+        for _ in range(20):
+            try:
+                return self.getdata(idx)
+            except Exception as e:
+                # Best-effort loading: report the problem and try another sample.
+                print(f"Error details: {str(e)}")
+                idx = np.random.randint(len(self))
+        raise RuntimeError('Too many bad data.')
+
+class HEDdetector(nn.Module):
+    """Frozen HED edge detector, optionally followed by a frozen VAE encoder."""
+
+    def __init__(self, feature=True, vae=None):
+        """Load the HED weights and, when ``feature`` is true, set up the VAE.
+
+        Args:
+            feature: when true, forward() encodes the edge map with the VAE.
+            vae: VAE to use; when ``None`` one is loaded from
+                ``output/pretrained_models/sd-vae-ft-ema``. Ignored when ``feature`` is false.
+        """
+        super().__init__()
+        self.model = ControlNetHED_Apache2()
+        # NOTE: torch.load unpickles the checkpoint; only point this at trusted files.
+        self.model.load_state_dict(torch.load('output/pretrained_models/ControlNetHED.pth', map_location='cpu'))
+        self.model.eval()
+        self.model.requires_grad_(False)
+        if feature:
+            if vae is None:
+                self.vae = AutoencoderKL.from_pretrained("output/pretrained_models/sd-vae-ft-ema")
+            else:
+                self.vae = vae
+            self.vae.eval()
+            self.vae.requires_grad_(False)
+        else:
+            self.vae = None
+
+    def forward(self, input_image):
+        """Compute the edge map of a ``(B, C, H, W)`` batch.
+
+        Returns:
+            Without a VAE: a ``(B, 1, H, W)`` tensor with values clipped to [0, 1].
+            With a VAE: a NumPy array holding the posterior mean and std of the encoded
+            edge map, concatenated along dim 1.
+        """
+        B, C, H, W = input_image.shape
+        with torch.inference_mode():
+            # The HED model is fed the input scaled by 255
+            # (assumes input_image is in [0, 1] — TODO confirm against callers).
+            edges = self.model(input_image * 255.)
+            # Bring every per-stage projection back to the input resolution before averaging.
+            edges = torch.cat([TF.resize(e, [H, W]) for e in edges], dim=1)
+            edge = 1 / (1 + torch.exp(-torch.mean(edges, dim=1, keepdim=True)))
+            edge.clip_(0, 1)
+            # Explicit None check: module truthiness would consult __len__/__bool__ if the
+            # supplied VAE object defines them.
+            if self.vae is not None:
+                edge = TF.normalize(edge, [.5], [.5])
+                edge = edge.repeat(1, 3, 1, 1)
+                posterior = self.vae.encode(edge).latent_dist
+                edge = torch.cat([posterior.mean, posterior.std], dim=1).cpu().numpy()
+        return edge
+
+
+def main():
+    """Run the HED detector over InternData and save one compressed ``.npz`` per image.
+
+    Outputs that already exist on disk are not rewritten.
+    """
+    dataset = InternData()
+    dataloader = DataLoader(dataset, batch_size=10, shuffle=False, num_workers=8, pin_memory=True)
+    hed = HEDdetector()
+
+    accelerator = Accelerator()
+    hed, dataloader = accelerator.prepare(hed, dataloader)
+
+    # NOTE(review): images are read from data/InternData and data/InternImgs while features
+    # are written under data/InternalData — confirm the differing directory names are intended.
+    output_root = f'data/InternalData/hed_feature_{image_resize}/'
+    for img, path in tqdm(dataloader):
+        # Use the device chosen by accelerate instead of assuming CUDA is available.
+        out = hed(img.to(accelerator.device))
+        for p, o in zip(path, out):
+            save = output_root + p.replace('.png', '.npz')
+            # np.savez_compressed appends '.npz' when the name lacks it (any non-.png input),
+            # so test for the file that is actually written.
+            written = save if save.endswith('.npz') else save + '.npz'
+            if os.path.exists(written):
+                continue
+            os.makedirs(os.path.dirname(save), exist_ok=True)
+            np.savez_compressed(save, o)
+
+
+# Run the feature extraction only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/diffusion/model/llava/__init__.py
+++ b/diffusion/model/llava/__init__.py
+from diffusion.model.llava.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig
\ No newline at end of file
--- a/diffusion/model/llava/llava_mpt.py
+++ b/diffusion/model/llava/llava_mpt.py
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+
+import math
+
+from transformers import AutoConfig, AutoModelForCausalLM, CLIPVisionModel, CLIPImageProcessor
+
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+
+from diffusion.model.llava.mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel
+
+
+# Placeholder token strings. The patch, start and end tokens are added to the tokenizer in
+# LlavaMPTForCausalLM.initialize_vision_tokenizer; DEFAULT_IMAGE_TOKEN is not referenced in
+# the visible part of this file.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+
+
+class LlavaMPTConfig(MPTConfig):
+    """MPTConfig subclass that only changes ``model_type`` to ``"llava_mpt"``."""
+    model_type = "llava_mpt"
+
+
+class LlavaMPTModel(MPTModel):
+    """MPTModel that replaces image placeholder token embeddings with projected CLIP features."""
+    config_class = LlavaMPTConfig
+
+    def __init__(self, config: MPTConfig, mm_vision_tower=None, mm_hidden_size=None):
+        """Build the base model and, when the config already describes them, the vision parts.
+
+        ``mm_vision_tower`` and ``mm_hidden_size`` are accepted but not read in this body;
+        the values come from ``config`` instead.
+        """
+        super(LlavaMPTModel, self).__init__(config)
+
+        if hasattr(config, "mm_vision_tower"):
+            # HACK: for FSDP
+            # Kept inside a plain list, so nn.Module does not register it as a submodule.
+            self.vision_tower = [CLIPVisionModel.from_pretrained(config.mm_vision_tower)]
+            # self.vision_tower = CLIPVisionModel.from_pretrained(config.mm_vision_tower)
+
+        if hasattr(config, "use_mm_proj"):
+            # Linear map from vision hidden size to the language model width.
+            self.mm_projector = nn.Linear(config.mm_hidden_size, config.d_model)
+
+    def initialize_vision_modules(self, vision_tower, mm_vision_select_layer,
+                                  pretrain_mm_mlp_adapter=None, tune_mm_mlp_adapter=False):
+        """Attach a frozen fp16 CLIP vision tower and the projector, recording both in the config.
+
+        Args:
+            vision_tower: name or path passed to ``from_pretrained``.
+            mm_vision_select_layer: index of the vision hidden state used as image features.
+            pretrain_mm_mlp_adapter: optional checkpoint path holding ``mm_projector`` weights.
+            tune_mm_mlp_adapter: accepted but not read in this body.
+
+        Returns:
+            dict with ``image_processor``, ``image_token_len`` and ``vision_config``.
+        """
+        self.config.mm_vision_tower = vision_tower
+
+        image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
+
+        # Reuse the tower created in __init__ when there is one; otherwise load it now.
+        if not hasattr(self, 'vision_tower'):
+            vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
+        else:
+            vision_tower = self.vision_tower[0]
+        vision_tower.requires_grad_(False)
+        vision_tower = vision_tower.to(torch.float16)
+        self.vision_tower = [vision_tower]
+
+        vision_config = vision_tower.config
+        # Number of patches per image: (image_size // patch_size) squared.
+        num_patches = (vision_config.image_size // vision_config.patch_size) ** 2
+
+        self.config.use_mm_proj = True
+        self.config.mm_hidden_size = vision_config.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+
+        if not hasattr(self, 'mm_projector'):
+            self.mm_projector = nn.Linear(vision_config.hidden_size, self.config.d_model)
+
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+            # Keep only entries whose key mentions 'mm_projector', reduced to their last
+            # dotted component.
+            self.mm_projector.load_state_dict({k.split('.')[-1]: v for k, v in mm_projector_weights.items() if 'mm_projector' in k})
+
+        return dict(
+            image_processor=image_processor,
+            image_token_len=num_patches,
+            vision_config=vision_config
+        )
+
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
+        """Embed ``input_ids``, splice in projected image features, then run the base model.
+
+        ``images`` may be a batched tensor or a list of per-image tensors. The splice is
+        skipped when no vision tower is attached, when ``images`` is None, or when a single
+        token is being processed outside training.
+
+        Raises:
+            ValueError: when placeholder tokens are unbalanced, misplaced, non-consecutive
+                or do not match the number of image patches.
+        """
+
+        # HACK: replace back original embeddings for LLaVA pretraining
+        # In this body orig_embeds_params only decides whether surrounding text embeddings
+        # are detached below.
+        orig_embeds_params = getattr(self, 'orig_embeds_params', None)
+        # if orig_embeds_params is not None:
+        #     orig_embeds_params = orig_embeds_params[0]
+        #     with torch.no_grad():
+        #         self.get_input_embeddings().weight.data[:-2] = orig_embeds_params[:-2].data
+
+        inputs_embeds = self.wte(input_ids)
+
+        vision_tower = getattr(self, 'vision_tower', None)
+        if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
+            # TODO: this is a modified multimodal LLM -- Haotian Liu
+            vision_tower = vision_tower[0]  # HACK: for FSDP
+            # The vision tower runs without gradients; only the projector below is differentiable.
+            with torch.no_grad():
+                if type(images) is list:
+                    # variable length images
+                    image_features = []
+                    for image in images:
+                        image_forward_out = vision_tower(image.unsqueeze(0), output_hidden_states=True)
+                        select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
+                        select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
+                        # Drop the first position (presumably the CLIP class token — confirm).
+                        image_feature = select_hidden_state[:, 1:]
+                        image_features.append(image_feature)
+                else:
+                    image_forward_outs = vision_tower(images, output_hidden_states=True)
+                    select_hidden_state_layer = getattr(self.config, "mm_vision_select_layer", -1)
+                    select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
+                    image_features = select_hidden_state[:, 1:]
+            if type(images) is list:
+                image_features = [self.mm_projector(image_feature)[0] for image_feature in image_features]
+            else:
+                image_features = self.mm_projector(image_features)
+            # NOTE(review): the dummy shape (256, 1024) is hard-coded, so the projector must
+            # accept 1024 input features here — confirm for other vision towers.
+            dummy_image_features = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+            dummy_image_features = self.mm_projector(dummy_image_features)
+
+            new_input_embeds = []
+            cur_image_idx = 0
+            for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
+                if (cur_input_ids == vision_tower.config.im_patch_token).sum() == 0:
+                    # multimodal LLM, but the current sample is not multimodal
+                    # Adds exactly zero; presumably keeps mm_projector in the autograd graph
+                    # for text-only samples — confirm.
+                    cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
+                    new_input_embeds.append(cur_input_embeds)
+                    continue
+                cur_image_features = image_features[cur_image_idx]
+                num_patches = cur_image_features.shape[0]
+                if vision_tower.config.use_im_start_end:
+                    if (cur_input_ids == vision_tower.config.im_start_token).sum() != (cur_input_ids == vision_tower.config.im_end_token).sum():
+                        raise ValueError("The number of image start tokens and image end tokens should be the same.")
+                    image_start_tokens = torch.where(cur_input_ids == vision_tower.config.im_start_token)[0]
+                    # NOTE(review): each iteration rebuilds cur_new_input_embeds from the
+                    # unmodified cur_input_embeds, so with several start tokens only the last
+                    # substitution is kept — confirm whether multi-image samples are expected.
+                    for image_start_token_pos in image_start_tokens:
+                        cur_image_features = image_features[cur_image_idx].to(device=cur_input_embeds.device)
+                        num_patches = cur_image_features.shape[0]
+                        if cur_input_ids[image_start_token_pos + num_patches + 1] != vision_tower.config.im_end_token:
+                            raise ValueError("The image end token should follow the image start token.")
+                        if orig_embeds_params is not None:
+                            # Text embeddings outside the start/end tokens are detached.
+                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
+                        else:
+                            cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
+                        cur_image_idx += 1
+                else:
+                    if (cur_input_ids == vision_tower.config.im_patch_token).sum() != num_patches:
+                        raise ValueError("The number of image patch tokens should be the same as the number of image patches.")
+                    masked_indices = torch.where(cur_input_ids == vision_tower.config.im_patch_token)[0]
+                    mask_index_start = masked_indices[0]
+                    # The patch tokens must form one contiguous run starting at mask_index_start.
+                    if (masked_indices != torch.arange(mask_index_start, mask_index_start+num_patches, device=masked_indices.device, dtype=masked_indices.dtype)).any():
+                        raise ValueError("The image patch tokens should be consecutive.")
+                    if orig_embeds_params is not None:
+                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start].detach(), cur_image_features, cur_input_embeds[mask_index_start+num_patches:].detach()), dim=0)
+                    else:
+                        cur_new_input_embeds = torch.cat((cur_input_embeds[:mask_index_start], cur_image_features, cur_input_embeds[mask_index_start+num_patches:]), dim=0)
+                    # NOTE(review): cur_image_idx is not advanced on this branch — confirm intended.
+                new_input_embeds.append(cur_new_input_embeds)
+            inputs_embeds = torch.stack(new_input_embeds, dim=0)
+
+        # The base model receives the prepared embeddings through tok_emb, not the token ids.
+        return super(LlavaMPTModel, self).forward(input_ids=None, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, tok_emb=inputs_embeds)
+
+
+class LlavaMPTForCausalLM(MPTForCausalLM):
+    """Causal language model head on top of LlavaMPTModel, with image inputs passed through."""
+    config_class = LlavaMPTConfig
+    supports_gradient_checkpointing = True
+
+    def __init__(self, config):
+        """Build the LLaVA transformer and resolve ``config.logit_scale``.
+
+        Raises:
+            ValueError: if word embeddings are not tied, or ``logit_scale`` is an unknown string.
+        """
+        # super(MPTForCausalLM, self) starts the MRO lookup after MPTForCausalLM, so that
+        # class's own __init__ is skipped and the transformer is built here instead.
+        super(MPTForCausalLM, self).__init__(config)
+
+        if not config.tie_word_embeddings:
+            raise ValueError('MPTForCausalLM only supports tied word embeddings')
+        self.transformer = LlavaMPTModel(config)
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == 'inv_sqrt_d_model':
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+            self.logit_scale = logit_scale
+
+    def get_model(self):
+        """Return the underlying LlavaMPTModel."""
+        return self.transformer
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set the ``gradient_checkpointing`` flag on LlavaMPTModel instances only."""
+        if isinstance(module, LlavaMPTModel):
+            module.gradient_checkpointing = value
+
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, images=None):
+        """Run the transformer, project onto the tied embedding matrix, optionally compute loss.
+
+        Returns:
+            CausalLMOutputWithPast with ``loss`` set only when ``labels`` is given.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache, images=images)
+        # Output projection reuses the input embedding weights (tied embeddings).
+        logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            # torch.roll without dims rolls the flattened tensor, so each row's last slot
+            # receives the next row's first label; that slot is overwritten with -100 below.
+            # -100 is the default ignore_index of F.cross_entropy.
+            labels = torch.roll(labels, shifts=-1)
+            labels[:, -1] = -100
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        """Assemble the keyword arguments for one generation step.
+
+        Raises:
+            NotImplementedError: for ``inputs_embeds``, for right-padded batches, and for
+                prefix-LM models called with ``use_cache=False``.
+        """
+        if inputs_embeds is not None:
+            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
+        attention_mask = kwargs['attention_mask'].bool()
+        # Every row must end on a real token, i.e. the last mask column is all ones.
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError('MPT does not support generation with right padding.')
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            # With cached keys/values only the newest token is fed to the model.
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            prefix_mask = torch.ones_like(attention_mask)
+            if kwargs.get('use_cache') == False:
+                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
+        else:
+            prefix_mask = None
+        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True), "images": kwargs.get("images", None)}
+
+    def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device,
+                                    tune_mm_mlp_adapter=False, pretrain_mm_mlp_adapter=None):
+        """Add the image placeholder tokens to ``tokenizer`` and record their ids.
+
+        The ids and the ``use_im_start_end`` flag are stored on the vision tower's config.
+        When start/end tokens are newly added, their embedding rows are initialised to the
+        mean of the existing rows and may then be overwritten from ``pretrain_mm_mlp_adapter``.
+        """
+        vision_config = self.get_model().vision_tower[0].config
+        vision_config.use_im_start_end = mm_use_im_start_end
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        self.resize_token_embeddings(len(tokenizer))
+
+        if mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+            vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
+
+            # NOTE(review): input_embeddings is only bound when num_new_tokens > 0; the
+            # pretrain branch below relies on its `assert num_new_tokens == 2`, which is
+            # stripped under `python -O`.
+            if num_new_tokens > 0:
+                input_embeddings = (
+                    self._extracted_from_initialize_vision_tokenizer_14(
+                        num_new_tokens
+                    )
+                )
+            if tune_mm_mlp_adapter:
+                # Snapshot of the embedding matrix; LlavaMPTModel.forward checks for the
+                # presence of orig_embeds_params.
+                self.get_model().orig_embeds_params = [self.get_input_embeddings().weight.data.clone().to(device=device)]
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = True
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+
+            if pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+                embed_tokens_weight = mm_projector_weights['transformer.wte.weight']
+                assert num_new_tokens == 2
+                # Accept either a full-size embedding table or one holding only the new rows.
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
+
+        vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
+
+    # TODO Rename this here and in `initialize_vision_tokenizer`
+    def _extracted_from_initialize_vision_tokenizer_14(self, num_new_tokens):
+        """Set the last ``num_new_tokens`` embedding rows to the mean of the earlier rows.
+
+        Applied in place to both the input and the output embedding data; returns the
+        input embedding data tensor.
+        """
+        result = self.get_input_embeddings().weight.data
+        output_embeddings = self.get_output_embeddings().weight.data
+
+        input_embeddings_avg = result[:-num_new_tokens].mean(dim=0, keepdim=True)
+        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+            dim=0, keepdim=True)
+
+        result[-num_new_tokens:] = input_embeddings_avg
+        output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+        return result
+
+# Import-time side effect: register the config and model classes with the transformers
+# Auto* registries under the "llava_mpt" model type.
+AutoConfig.register("llava_mpt", LlavaMPTConfig)
+AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM)
--- a/diffusion/model/llava/mpt/attention.py
+++ b/diffusion/model/llava/mpt/attention.py
+"""Attention layers."""
+import math
+import warnings
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch import nn
+from .norm import LPLayerNorm
+
+def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
+    """Return the causal flag to use for the given query/key lengths.
+
+    The flag is returned unchanged when it is falsy or when both lengths match. With a
+    single query token against a different number of keys the result is ``False``.
+
+    Raises:
+        NotImplementedError: causal attention with differing lengths and more than one
+            query token.
+    """
+    if not original_is_causal or num_query_tokens == num_key_tokens:
+        return original_is_causal
+    if num_query_tokens == 1:
+        return False
+    raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
+
+def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+    """Plain PyTorch multi-head attention over ``(batch, seq, heads * dim)`` inputs.
+
+    With ``multiquery`` the key and value carry a single head that is broadcast across
+    the query heads. Masked positions are filled with the most negative value of the
+    query dtype before the softmax.
+
+    Returns:
+        ``(out, attn_weight)`` when ``needs_weights`` is true, otherwise ``(out, None)``.
+
+    Raises:
+        RuntimeError: if ``attn_bias`` cannot broadcast to the attention weights.
+    """
+    kv_heads = 1 if multiquery else n_heads
+    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    # Keys are laid out already transposed so that q @ k yields (b, h, s_q, s_k).
+    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_heads)
+    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_heads)
+    fill_value = torch.finfo(q.dtype).min
+    batch, _, s_q, head_dim = q.shape
+    s_k = k.size(-1)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(head_dim)
+    attn_weight = q.matmul(k) * softmax_scale
+
+    has_bias = attn_bias is not None
+    if has_bias:
+        key_dim_ok = attn_bias.size(-1) in [1, s_k]
+        query_dim_ok = attn_bias.size(-2) in [1, s_q]
+        if not (key_dim_ok and query_dim_ok):
+            raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
+        attn_weight = attn_weight + attn_bias
+
+    if key_padding_mask is not None:
+        if has_bias:
+            warnings.warn(
+                'Propogating key_padding_mask to the attention module '
+                'and applying it within the attention module can cause '
+                'unneccessary computation/memory usage. Consider integrating '
+                'into attn_bias once and passing that to each attention '
+                'module instead.'
+            )
+        padded_keys = ~key_padding_mask.view((batch, 1, 1, s_k))
+        attn_weight = attn_weight.masked_fill(padded_keys, fill_value)
+
+    if is_causal:
+        side = max(s_q, s_k)
+        # True marks the strictly upper triangle, i.e. the positions to hide.
+        hidden = ~attn_weight.new_ones(side, side, dtype=torch.float16).tril().to(torch.bool)
+        hidden = hidden[-s_q:, -s_k:]
+        attn_weight = attn_weight.masked_fill(hidden.view(1, 1, s_q, s_k), fill_value)
+
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
+    out = rearrange(attn_weight.matmul(v), 'b h s d -> b s (h d)')
+    if needs_weights:
+        return (out, attn_weight)
+    return (out, None)
+
+def check_valid_inputs(*tensors, valid_dtypes=None):
+    """Validate that every tensor has an allowed dtype and lives on a CUDA device.
+
+    ``valid_dtypes`` defaults to ``[torch.float16, torch.bfloat16]``.
+
+    Raises:
+        TypeError: on the first tensor with a disallowed dtype or a non-CUDA device.
+    """
+    allowed = [torch.float16, torch.bfloat16] if valid_dtypes is None else valid_dtypes
+    for tensor in tensors:
+        if tensor.dtype not in allowed:
+            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={allowed!r}.')
+        if not tensor.is_cuda:
+            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
+
+def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+    """Attention through the ``flash_attn`` unpadded kernel.
+
+    Inputs are validated with ``check_valid_inputs``, stripped of padding, passed to
+    ``flash_attn_unpadded_func`` and padded back to ``(batch, seqlen, heads * dim)``.
+
+    Returns:
+        ``(output, None)``; attention weights are never returned, even though
+        ``needs_weights`` is forwarded to the kernel as ``return_attn_probs``.
+
+    Raises:
+        RuntimeError: if ``flash_attn`` cannot be imported.
+        NotImplementedError: if ``attn_bias`` is given.
+    """
+    try:
+        from flash_attn import bert_padding, flash_attn_interface
+    except Exception as err:
+        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit; chaining
+        # keeps the original import failure visible in the traceback.
+        raise RuntimeError('Please install flash-attn==1.0.3.post0') from err
+    check_valid_inputs(query, key, value)
+    if attn_bias is not None:
+        raise NotImplementedError('attn_bias not implemented for flash attn.')
+    (batch_size, seqlen) = query.shape[:2]
+    if key_padding_mask is None:
+        # No mask supplied: treat every key position as valid.
+        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
+    # The queries line up with the trailing key positions.
+    query_padding_mask = key_padding_mask[:, -query.size(1):]
+    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
+    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
+    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
+    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
+    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
+    if multiquery:
+        # Broadcast the single key/value head across all query heads.
+        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
+        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    # NOTE(review): with needs_weights=True the kernel may return more than the output
+    # tensor, which the rearrange below does not expect — confirm against flash-attn 1.0.3.
+    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
+    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
+    return (output, None)
+
+def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
+    """Attention through the Triton flash-attention kernel from ``flash_attn``.
+
+    A ``key_padding_mask`` is folded into ``attn_bias`` before the kernel call.
+
+    Returns:
+        ``(output, None)`` with ``output`` shaped ``(batch, seq, heads * dim)``.
+
+    Raises:
+        RuntimeError: if ``flash_attn.flash_attn_triton`` cannot be imported.
+        NotImplementedError: if ``dropout_p`` is non-zero or ``needs_weights`` is true.
+    """
+    try:
+        from flash_attn import flash_attn_triton
+    except Exception as err:
+        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit; chaining
+        # keeps the original import failure visible in the traceback.
+        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') from err
+    check_valid_inputs(query, key, value)
+    if dropout_p:
+        raise NotImplementedError('Dropout not implemented for attn_impl: triton.')
+    if needs_weights:
+        raise NotImplementedError('attn_impl: triton cannot return attn weights.')
+    if key_padding_mask is not None:
+        warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
+        (b_size, s_k) = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        # Padded key positions get the most negative value of the query dtype.
+        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
+    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
+    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
+    value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
+    if multiquery:
+        # Broadcast the single key/value head across all query heads.
+        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
+        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
+    # Merge the head and per-head dimensions back into one feature dimension.
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return (output, None)
+
+class MultiheadAttention(nn.Module):
+    """Multi-head self-attention built around one fused QKV projection.
+
+    The attention kernel is selected by ``attn_impl`` ('flash', 'triton' or
+    'torch'). Using the torch or triton attention implementation enables the
+    user to also use an additive bias.
+    """
+
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str='triton',
+        clip_qkv: Optional[float]=None,
+        qk_ln: bool=False,
+        softmax_scale: Optional[float]=None,
+        attn_pdrop: float=0.0,
+        low_precision_layernorm: bool=False,
+        device: Optional[str]=None,
+    ):
+        """Create the projections and bind the attention kernel.
+
+        Args:
+            d_model: Width of the input and output features.
+            n_heads: Number of attention heads.
+            attn_impl: One of 'flash', 'triton' or 'torch'.
+            clip_qkv: If truthy, the fused QKV activations are clamped to
+                ``[-clip_qkv, clip_qkv]`` in ``forward``.
+            qk_ln: If True, queries and keys are layer-normalized.
+            softmax_scale: Attention scale; when None it defaults to
+                ``1 / sqrt(d_model / n_heads)``.
+            attn_pdrop: Dropout probability handed to the attention kernel.
+            low_precision_layernorm: Use ``LPLayerNorm`` rather than
+                ``nn.LayerNorm`` for the optional query/key norms.
+            device: Device on which the parameters are created.
+
+        Raises:
+            ValueError: If ``attn_impl`` is not a recognized implementation.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        if softmax_scale is None:
+            softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.softmax_scale = softmax_scale
+        self.attn_dropout_p = attn_pdrop
+
+        # One linear layer emits query, key and value side by side.
+        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
+        # Offsets at which the fused output splits into q / k / v.
+        self.Wqkv._fused = (0, (d_model, 2 * d_model))
+
+        if self.qk_ln:
+            norm_cls = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = norm_cls(self.d_model, device=device)
+            self.k_ln = norm_cls(self.d_model, device=device)
+
+        impl = self.attn_impl
+        if impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+            warnings.warn(
+                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
+                'it uses more memory. When training larger models this can trigger '
+                'alloc retries which hurts performance. If encountered, we recommend '
+                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
+            )
+        elif impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available():
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
+                    'we recommend using `attn_impl: triton`.'
+                )
+        else:
+            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True
+
+    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
+        """Run attention over ``x`` and project the result.
+
+        Args:
+            x: Input activations; split along dim 2 into q/k/v after the
+                fused projection (assumes (batch, seq, d_model) - TODO confirm).
+            past_key_value: Optional cached ``(key, value)``; an empty
+                container means "start caching". Cached tensors are
+                concatenated in front of the new ones along dim 1.
+            attn_bias: Optional additive bias, cropped to the trailing
+                query/key lengths before use.
+            attention_mask: Passed to the kernel as ``key_padding_mask``.
+            is_causal: Forwarded to the attention kernel.
+            needs_weights: Forwarded to the attention kernel.
+
+        Returns:
+            Tuple ``(output, attn_weights, past_key_value)``.
+        """
+        fused = self.Wqkv(x)
+        if self.clip_qkv:
+            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = fused.chunk(3, dim=2)
+
+        if self.qk_ln:
+            # Cast back in case the norm layer changed the dtype.
+            orig_dtype = query.dtype
+            query = self.q_ln(query).to(orig_dtype)
+            key = self.k_ln(key).to(orig_dtype)
+
+        if past_key_value is not None:
+            if len(past_key_value) > 0:
+                cached_key, cached_value = past_key_value[0], past_key_value[1]
+                key = torch.cat([cached_key, key], dim=1)
+                value = torch.cat([cached_value, value], dim=1)
+            past_key_value = (key, value)
+
+        if attn_bias is not None:
+            q_len, k_len = query.size(1), key.size(1)
+            attn_bias = attn_bias[:, :, -q_len:, -k_len:]
+
+        context, attn_weights = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=attention_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+        )
+        return (self.out_proj(context), attn_weights, past_key_value)
+
+class MultiQueryAttention(nn.Module):
+    """Multi-query self-attention: many query heads, one shared key/value head.
+
+    The attention kernel is selected by ``attn_impl`` ('flash', 'triton' or
+    'torch'). Using the torch or triton attention implementation enables the
+    user to also use an additive bias.
+    """
+
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str='triton',
+        clip_qkv: Optional[float]=None,
+        qk_ln: bool=False,
+        softmax_scale: Optional[float]=None,
+        attn_pdrop: float=0.0,
+        low_precision_layernorm: bool=False,
+        device: Optional[str]=None,
+    ):
+        """Create the projections and bind the attention kernel.
+
+        Args:
+            d_model: Width of the input and output features.
+            n_heads: Number of query heads; ``head_dim = d_model // n_heads``.
+            attn_impl: One of 'flash', 'triton' or 'torch'.
+            clip_qkv: If truthy, the fused QKV activations are clamped to
+                ``[-clip_qkv, clip_qkv]`` in ``forward``.
+            qk_ln: If True, queries and keys are layer-normalized.
+            softmax_scale: Attention scale; when None it defaults to
+                ``1 / sqrt(head_dim)``.
+            attn_pdrop: Dropout probability handed to the attention kernel.
+            low_precision_layernorm: Use ``LPLayerNorm`` rather than
+                ``nn.LayerNorm`` for the optional query/key norms.
+            device: Device on which the parameters are created.
+
+        Raises:
+            ValueError: If ``attn_impl`` is not a recognized implementation.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.head_dim = d_model // n_heads
+        if softmax_scale is None:
+            softmax_scale = 1 / math.sqrt(self.head_dim)
+        self.softmax_scale = softmax_scale
+        self.attn_dropout_p = attn_pdrop
+
+        # Fused output: d_model query features plus one key head and one value head.
+        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
+        # Offsets at which the fused output splits into q / k / v.
+        self.Wqkv._fused = (0, (d_model, d_model + self.head_dim))
+
+        if self.qk_ln:
+            norm_cls = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = norm_cls(d_model, device=device)
+            self.k_ln = norm_cls(self.head_dim, device=device)
+
+        impl = self.attn_impl
+        if impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+            warnings.warn(
+                'While `attn_impl: triton` can be faster than `attn_impl: flash` '
+                'it uses more memory. When training larger models this can trigger '
+                'alloc retries which hurts performance. If encountered, we recommend '
+                'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
+            )
+        elif impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available():
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
+                    'we recommend using `attn_impl: triton`.'
+                )
+        else:
+            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True
+
+    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
+        """Run multi-query attention over ``x`` and project the result.
+
+        Args:
+            x: Input activations; the fused projection is split along dim 2
+                into ``d_model`` query features and ``head_dim`` key and value
+                features (assumes (batch, seq, d_model) - TODO confirm).
+            past_key_value: Optional cached ``(key, value)``; an empty
+                container means "start caching". Cached tensors are
+                concatenated in front of the new ones along dim 1.
+            attn_bias: Optional additive bias, cropped to the trailing
+                query/key lengths before use.
+            attention_mask: Passed to the kernel as ``key_padding_mask``.
+            is_causal: Forwarded to the attention kernel.
+            needs_weights: Forwarded to the attention kernel.
+
+        Returns:
+            Tuple ``(output, attn_weights, past_key_value)``.
+        """
+        fused = self.Wqkv(x)
+        if self.clip_qkv:
+            fused.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        split_sizes = [self.d_model, self.head_dim, self.head_dim]
+        query, key, value = fused.split(split_sizes, dim=2)
+
+        if self.qk_ln:
+            # Cast back in case the norm layer changed the dtype.
+            orig_dtype = query.dtype
+            query = self.q_ln(query).to(orig_dtype)
+            key = self.k_ln(key).to(orig_dtype)
+
+        if past_key_value is not None:
+            if len(past_key_value) > 0:
+                cached_key, cached_value = past_key_value[0], past_key_value[1]
+                key = torch.cat([cached_key, key], dim=1)
+                value = torch.cat([cached_value, value], dim=1)
+            past_key_value = (key, value)
+
+        if attn_bias is not None:
+            q_len, k_len = query.size(1), key.size(1)
+            attn_bias = attn_bias[:, :, -q_len:, -k_len:]
+
+        context, attn_weights = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=attention_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+            multiquery=True,
+        )
+        return (self.out_proj(context), attn_weights, past_key_value)
+
+def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
+    """Return the shape of the attention-bias buffer, or None if none is needed.
+
+    Args:
+        attn_impl: 'flash', 'torch' or 'triton'.
+        n_heads: Number of attention heads (used only with alibi).
+        seq_len: Sequence length used for the last one or two dimensions.
+        alibi: Whether an alibi bias is used.
+        prefix_lm: Whether prefix-LM masking is used.
+        causal: Whether attention is causal.
+        use_sequence_id: Whether sequence-id masking is used.
+
+    Returns:
+        A 4-tuple shape, or None ('flash' always yields None).
+
+    Raises:
+        ValueError: If ``attn_impl`` is not a recognized implementation.
+    """
+    if attn_impl == 'flash':
+        return None
+    if attn_impl not in ('torch', 'triton'):
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+
+    # Prefix-LM and sequence-id masking both need a full query-by-key mask.
+    needs_full_mask = prefix_lm or use_sequence_id
+    if not alibi:
+        return (1, 1, seq_len, seq_len) if needs_full_mask else None
+    if needs_full_mask or not causal:
+        return (1, n_heads, seq_len, seq_len)
+    # Causal alibi only needs one row per head.
+    return (1, n_heads, 1, seq_len)
+
+def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
+    """Add the alibi term to ``attn_bias`` when requested.
+
+    Args:
+        attn_impl: 'flash', 'torch' or 'triton'.
+        attn_bias: Existing bias tensor; its device and dtype are reused for
+            the alibi term.
+        n_heads: Number of attention heads.
+        seq_len: Sequence length of the alibi term.
+        causal: When False, the full (non-causal) alibi bias is built.
+        alibi: Whether to add an alibi bias at all.
+        alibi_bias_max: Forwarded to ``build_alibi_bias``.
+
+    Returns:
+        None for 'flash'; otherwise ``attn_bias``, with the alibi term added
+        out-of-place when ``alibi`` is set.
+
+    Raises:
+        ValueError: If ``attn_impl`` is not a recognized implementation.
+    """
+    if attn_impl == 'flash':
+        return None
+    if attn_impl not in ('torch', 'triton'):
+        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
+    if not alibi:
+        return attn_bias
+    alibi_term = build_alibi_bias(
+        n_heads,
+        seq_len,
+        full=not causal,
+        alibi_bias_max=alibi_bias_max,
+        device=attn_bias.device,
+        dtype=attn_bias.dtype,
+    )
+    return attn_bias.add(alibi_term)
+
+def gen_slopes(n_heads, alibi_bias_max=8, device=None):
+    """Compute one alibi slope per attention head.
+
+    Slopes are ``1 / 2**m`` for ``m`` evenly spaced up to ``alibi_bias_max``
+    over the next power of two at or above ``n_heads``.
+
+    Args:
+        n_heads: Number of attention heads.
+        alibi_bias_max: Largest exponent used for the slopes.
+        device: Device on which the tensor is created.
+
+    Returns:
+        A float32 tensor of shape ``(1, n_heads, 1, 1)``.
+    """
+    padded_heads = 2 ** math.ceil(math.log2(n_heads))
+    exponents = torch.arange(1, padded_heads + 1, dtype=torch.float32, device=device)
+    exponents = exponents.mul(alibi_bias_max / padded_heads)
+    slopes = 1.0 / torch.pow(2, exponents)
+    if padded_heads != n_heads:
+        # Not a power of two: take odd-indexed slopes first, then the
+        # even-indexed ones, and keep the first n_heads.
+        interleaved = torch.concat([slopes[1::2], slopes[::2]])
+        slopes = interleaved[:n_heads]
+    return slopes.view(1, n_heads, 1, 1)
+
+def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
+    """Build the alibi attention bias.
+
+    Args:
+        n_heads: Number of attention heads.
+        seq_len: Sequence length.
+        full: When True, build the symmetric ``-|i - j|`` bias over all
+            query/key pairs; otherwise a single row of relative offsets.
+        alibi_bias_max: Forwarded to ``gen_slopes``.
+        device: Device on which the tensors are created.
+        dtype: Dtype of the returned tensor.
+
+    Returns:
+        The offsets scaled by the per-head slopes, cast to ``dtype``.
+    """
+    # Offsets 1 - seq_len, ..., 0.
+    offsets = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device)
+    bias = offsets.view(1, 1, 1, seq_len)
+    if full:
+        pairwise = bias - offsets.view(1, 1, seq_len, 1)
+        bias = pairwise.abs().mul(-1)
+    head_slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
+    bias = bias * head_slopes
+    return bias.to(dtype=dtype)
+# Maps the `attn_type` config value to the attention module class.
+ATTN_CLASS_REGISTRY = {
+    'multihead_attention': MultiheadAttention,
+    'multiquery_attention': MultiQueryAttention,
+}
\ No newline at end of file
--- a/diffusion/model/llava/mpt/blocks.py
+++ b/diffusion/model/llava/mpt/blocks.py
+"""GPT Blocks used for the GPT Model."""
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+
+class MPTMLP(nn.Module):
+    """Feed-forward sub-layer: linear up-projection, GELU, linear down-projection."""
+
+    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
+        """Create the two projections.
+
+        Args:
+            d_model: Width of the input and output features.
+            expansion_ratio: The hidden width is ``expansion_ratio * d_model``.
+            device: Device on which the parameters are created.
+        """
+        super().__init__()
+        hidden_dim = expansion_ratio * d_model
+        self.up_proj = nn.Linear(d_model, hidden_dim, device=device)
+        self.act = nn.GELU(approximate='none')
+        self.down_proj = nn.Linear(hidden_dim, d_model, device=device)
+        # Tag the layer that feeds the residual stream.
+        self.down_proj._is_residual = True
+
+    def forward(self, x):
+        """Apply up-projection, activation and down-projection to ``x``."""
+        hidden = self.up_proj(x)
+        hidden = self.act(hidden)
+        return self.down_proj(hidden)
+
+class MPTBlock(nn.Module):
+    """One pre-norm transformer block: attention and MLP, each with a residual add."""
+
+    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict = None, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
+        """Build the block.
+
+        Args:
+            d_model: Width of the features.
+            n_heads: Number of attention heads.
+            expansion_ratio: Hidden-width multiplier for the MLP.
+            attn_config: Attention settings; when None a built-in default
+                (multi-head attention, triton implementation) is used.
+            resid_pdrop: Dropout probability applied to both residual branches.
+            norm_type: Key into ``NORM_CLASS_REGISTRY`` (lower-cased first).
+            device: Device on which the parameters are created.
+            **kwargs: Accepted and discarded, so a whole config dict can be
+                splatted into the constructor.
+        """
+        super().__init__()
+        del kwargs
+        if attn_config is None:
+            attn_config = {
+                'attn_type': 'multihead_attention',
+                'attn_pdrop': 0.0,
+                'attn_impl': 'triton',
+                'qk_ln': False,
+                'clip_qkv': None,
+                'softmax_scale': None,
+                'prefix_lm': False,
+                'attn_uses_sequence_id': False,
+                'alibi': False,
+                'alibi_bias_max': 8,
+            }
+        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
+
+        self.norm_1 = norm_class(d_model, device=device)
+        self.attn = attn_class(
+            attn_impl=attn_config['attn_impl'],
+            clip_qkv=attn_config['clip_qkv'],
+            qk_ln=attn_config['qk_ln'],
+            softmax_scale=attn_config['softmax_scale'],
+            attn_pdrop=attn_config['attn_pdrop'],
+            d_model=d_model,
+            n_heads=n_heads,
+            device=device,
+        )
+        self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
+
+    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
+        """Apply the attention and MLP sub-layers to ``x``.
+
+        Returns:
+            Tuple ``(x, past_key_value)``, where ``past_key_value`` is
+            whatever the attention module returned for its cache.
+        """
+        # Attention branch: norm -> attention -> dropout -> residual add.
+        normed = self.norm_1(x)
+        attn_out, _, past_key_value = self.attn(
+            normed,
+            past_key_value=past_key_value,
+            attn_bias=attn_bias,
+            attention_mask=attention_mask,
+            is_causal=is_causal,
+        )
+        x = x + self.resid_attn_dropout(attn_out)
+
+        # MLP branch: norm -> feed-forward -> dropout -> residual add.
+        ffn_out = self.ffn(self.norm_2(x))
+        x = x + self.resid_ffn_dropout(ffn_out)
+        return (x, past_key_value)
\ No newline at end of file
--- a/diffusion/model/llava/mpt/configuration_mpt.py
+++ b/diffusion/model/llava/mpt/configuration_mpt.py
+"""A HuggingFace-style model configuration."""
+from typing import Dict, Optional, Union
+from transformers import PretrainedConfig
+# Fallback values merged into any user-supplied `attn_config`.
+attn_config_defaults: Dict = {
+    'attn_type': 'multihead_attention',
+    'attn_pdrop': 0.0,
+    'attn_impl': 'triton',
+    'qk_ln': False,
+    'clip_qkv': None,
+    'softmax_scale': None,
+    'prefix_lm': False,
+    'attn_uses_sequence_id': False,
+    'alibi': False,
+    'alibi_bias_max': 8,
+}
+# Fallback values merged into any user-supplied `init_config`.
+init_config_defaults: Dict = {
+    'name': 'kaiming_normal_',
+    'fan_mode': 'fan_in',
+    'init_nonlinearity': 'relu',
+}
+
+class MPTConfig(PretrainedConfig):
+    """HuggingFace-style configuration for MPT models."""
+
+    model_type = 'mpt'
+
+    def __init__(
+        self,
+        d_model: int=2048,
+        n_heads: int=16,
+        n_layers: int=24,
+        expansion_ratio: int=4,
+        max_seq_len: int=2048,
+        vocab_size: int=50368,
+        resid_pdrop: float=0.0,
+        emb_pdrop: float=0.0,
+        learned_pos_emb: bool=True,
+        attn_config: Dict=attn_config_defaults,
+        init_device: str='cpu',
+        logit_scale: Optional[Union[float, str]]=None,
+        no_bias: bool=False,
+        verbose: int=0,
+        embedding_fraction: float=1.0,
+        norm_type: str='low_precision_layernorm',
+        use_cache: bool=False,
+        init_config: Dict=init_config_defaults,
+        **kwargs,
+    ):
+        """The MPT configuration class.
+
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (int): The ratio of the up/down scale in the MLP.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict):  A dictionary used to configure the model's attention module.
+                The dictionary is copied, and missing keys are filled from ``attn_config_defaults``:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): If True, bias parameters are removed from all layers.
+            verbose (int): The verbosity level. 0 is silent.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization.
+                The dictionary is copied, and missing keys are filled from ``init_config_defaults``:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            **kwargs: Forwarded to ``PretrainedConfig``; the keys 'name' and 'loss_fn' are dropped first.
+
+        Raises:
+            ValueError: If a setting fails ``_validate_config``.
+            NotImplementedError: If a feature is requested with an attention implementation that does not support it.
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        # Store copies: the default arguments are the shared module-level
+        # dicts, and code that later writes into config.attn_config or
+        # config.init_config must not leak into them or into the caller's dict.
+        self.attn_config = dict(attn_config)
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.verbose = verbose
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = dict(init_config)
+        # These keys are dropped before the remaining kwargs reach PretrainedConfig.
+        kwargs.pop('name', None)
+        kwargs.pop('loss_fn', None)
+        super().__init__(**kwargs)
+        self._validate_config()
+
+    def _set_config_defaults(self, config, config_defaults):
+        """Fill keys missing from ``config`` with ``config_defaults`` (in place) and return it."""
+        for (k, v) in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+
+    def _validate_config(self):
+        """Apply default sub-config values and reject inconsistent settings.
+
+        Raises:
+            ValueError: For out-of-range or unrecognized values.
+            NotImplementedError: When prefix_lm, alibi or attn_uses_sequence_id
+                is requested with an attention implementation other than
+                'torch' or 'triton'.
+        """
+        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
+            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError('alibi only implemented with torch and triton attention.')
+        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
+        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+        if self.init_config.get('name', None) is None:
+            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
+        if not self.learned_pos_emb and (not self.attn_config['alibi']):
+            raise ValueError(
+                'Positional information must be provided to the model using either learned_pos_emb or alibi.'
+            )
\ No newline at end of file
--- a/diffusion/model/llava/mpt/modeling_mpt.py
+++ b/diffusion/model/llava/mpt/modeling_mpt.py
+"""A simple, flexible implementation of a GPT model.
+
+Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+"""
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from .attention import attn_bias_shape, build_attn_bias
+from .blocks import MPTBlock
+from .norm import NORM_CLASS_REGISTRY
+from .configuration_mpt import MPTConfig
+from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
+Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+
+class MPTPreTrainedModel(PreTrainedModel):
+    """Common base class that ties the MPT models to ``MPTConfig``."""
+
+    # Attribute prefix of the wrapped base model.
+    base_model_prefix = 'model'
+    # Configuration class associated with these models.
+    config_class = MPTConfig
+
+class MPTModel(MPTPreTrainedModel):
+
+    def __init__(self, config: MPTConfig):
+        config._validate_config()
+        super().__init__(config)
+        self.attn_impl = config.attn_config['attn_impl']
+        self.prefix_lm = config.attn_config['prefix_lm']
+        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
+        self.alibi = config.attn_config['alibi']
+        self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
+            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
+            raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
+        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
+        self.embedding_fraction = config.embedding_fraction
+        self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)
+        if not self.alibi:
+            self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
+        self.emb_drop = nn.Dropout(config.emb_pdrop)
+        self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
+        self.norm_f = norm_class(config.d_model, device=config.init_device)
+        if config.init_device != 'meta':
+            self.apply(self.param_init_fn)
+        self.is_causal = not self.prefix_lm
+        self._attn_bias_initialized = False
+        self.attn_bias = None
+        self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
+        if config.no_bias:
+            for module in self.modules():
+                if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
+                    if config.verbose:
+                        warnings.warn(f'Removing bias ({module.bias}) from {module}.')
+                    module.register_parameter('bias', None)
+        if config.verbose and config.verbose > 2:
+            print(self)
+        if 'verbose' not in self.config.init_config:
+            self.config.init_config['verbose'] = self.config.verbose
+        if self.config.init_config['verbose'] > 1:
+            init_fn_name = self.config.init_config['name']
+            warnings.warn(f'Using {init_fn_name} initialization.')
+        self.gradient_checkpointing = False
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, value):
+        self.wte = value
+
+    @torch.no_grad()
+    def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
+        if not self._attn_bias_initialized:
+            if self.attn_bias_shape:
+                self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
+                self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
+            self._attn_bias_initialized = True
+        if self.attn_impl == 'flash':
+            return (self.attn_bias, attention_mask)
+        if self.attn_bias is not None:
+            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
+        attn_bias = self.attn_bias
+        if self.prefix_lm:
+            assert isinstance(attn_bias, torch.Tensor)
+            assert isinstance(prefix_mask, torch.Tensor)
+            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
+        if self.attn_uses_sequence_id and sequence_id is not None:
+            assert isinstance(attn_bias, torch.Tensor)
+            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
+        if attention_mask is not None:
+            s_k = attention_mask.shape[-1]
+            if attn_bias is None:
+                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
+            else:
+                attn_bias = attn_bias[:, :, :, -s_k:]
+            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
+                raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
+            min_val = torch.finfo(attn_bias.dtype).min
+            attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
+        return (attn_bias, None)
+
+    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
+        (s_k, s_q) = attn_bias.shape[-2:]
+        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
+            raise ValueError(
+                f'attn_bias does not match the expected shape. The last two dimensions should both be {self.config.max_length} '
+                + f'but are {s_k} and {s_q}.'
+            )
+        seq_len = prefix_mask.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
+        prefix = prefix_mask.view(-1, 1, 1, seq_len)
+        cannot_attend = ~torch.logical_or(causal, prefix.bool())
+        return self._extracted_from__apply_sequence_id_15(attn_bias, cannot_attend)
+
+    def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
+        seq_len = sequence_id.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
+        return self._extracted_from__apply_sequence_id_15(attn_bias, cannot_attend)
+
+    # TODO Rename this here and in `_apply_prefix_mask` and `_apply_sequence_id`
+    def _extracted_from__apply_sequence_id_15(self, attn_bias, cannot_attend):
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, tok_emb: Optional[torch.FloatTensor]=None):
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+            )
+            use_cache = False
+        if attention_mask is not None:
+            attention_mask = attention_mask.bool()
+        if prefix_mask is not None:
+            prefix_mask = prefix_mask.bool()
+        if not return_dict:
+            raise NotImplementedError('return_dict False is not implemented yet for MPT')
+        if output_attentions:
+            raise NotImplementedError('output_attentions is not implemented yet for MPT')
+        if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
+            raise NotImplementedError('MPT does not support training with left padding.')
+        if self.prefix_lm and prefix_mask is None:
+            raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
+        if self.training:
+            if self.attn_uses_sequence_id and sequence_id is None:
+                raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
+            elif self.attn_uses_sequence_id is False and sequence_id is not None:
+                warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
+        if input_ids is not None:
+            S = input_ids.size(1)
+            assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
+            tok_emb = self.wte(input_ids)
+        else:
+            assert tok_emb is not None
+            S = tok_emb.size(1)
+        if self.alibi:
+            x = tok_emb
+        else:
+            past_position = 0
+            if past_key_values is not None:
+                if len(past_key_values) != self.config.n_layers:
+                    raise ValueError(
+                        f'past_key_values must provide a past_key_value for each attention layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).'
+                    )
+                past_position = past_key_values[0][0].size(1)
+            if S + past_position > self.config.max_seq_len:
+                raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
+            pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
+            if attention_mask is not None:
+                pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
+            pos_emb = self.wpe(pos)
+            x = tok_emb + pos_emb
+        if self.embedding_fraction == 1:
+            x = self.emb_drop(x)
+        else:
+            x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
+            assert isinstance(self.emb_drop, nn.Module)
+            x = self.emb_drop(x_shrunk)
+        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
+        if use_cache and past_key_values is None:
+            past_key_values = [() for _ in range(self.config.n_layers)]
+        all_hidden_states = () if output_hidden_states else None
+        for (b_idx, block) in enumerate(self.blocks):
+            if output_hidden_states:
+                assert all_hidden_states is not None
+                all_hidden_states = all_hidden_states + (x,)
+            past_key_value = past_key_values[b_idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                (x, past_key_value) = torch.utils.checkpoint.checkpoint(
+                    block,
+                    x, past_key_value, attn_bias, attention_mask, self.is_causal
+                )
+            else:
+                (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
+            if past_key_values is not None:
+                past_key_values[b_idx] = past_key_value
+        x = self.norm_f(x)
+        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
+
+    def param_init_fn(self, module):
+        # Look up the initialiser registered under init_config['name'] and apply
+        # it to `module`; the complete init_config mapping is forwarded as extra
+        # keyword arguments alongside n_layers and d_model.
+        init_config = self.config.init_config
+        init_fn_ = MODEL_INIT_REGISTRY[init_config['name']]
+        init_fn_(module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **init_config)
+
+    def fsdp_wrap_fn(self, module):
+        # True only for MPTBlock instances. Presumably consulted by the training
+        # framework as an FSDP wrapping predicate -- TODO confirm against the caller.
+        return isinstance(module, MPTBlock)
+
+    def activation_checkpointing_fn(self, module):
+        # True only for MPTBlock instances. Presumably selects the modules whose
+        # activations are checkpointed -- TODO confirm against the caller.
+        return isinstance(module, MPTBlock)
+
+# Causal-LM wrapper around MPTModel: logits are produced by projecting the
+# transformer's last hidden state with the token-embedding matrix
+# (transformer.wte.weight), optionally multiplied by `logit_scale`.
+class MPTForCausalLM(MPTPreTrainedModel):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__(config)
+        # The head reuses the input embedding matrix, so untied embeddings are rejected.
+        if not config.tie_word_embeddings:
+            raise ValueError('MPTForCausalLM only supports tied word embeddings')
+        self.transformer = MPTModel(config)
+        # logit_scale stays None (no scaling) unless the config supplies a value.
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                # The only accepted string value resolves to 1 / sqrt(d_model).
+                if logit_scale == 'inv_sqrt_d_model':
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
+            self.logit_scale = logit_scale
+
+    # Input and output embeddings are the same module (transformer.wte), so the
+    # four accessors below all read or replace that single attribute.
+    def get_input_embeddings(self):
+        return self.transformer.wte
+
+    def set_input_embeddings(self, value):
+        self.transformer.wte = value
+
+    def get_output_embeddings(self):
+        return self.transformer.wte
+
+    def set_output_embeddings(self, new_embeddings):
+        self.transformer.wte = new_embeddings
+
+    # The "decoder" is the wrapped MPTModel.
+    def set_decoder(self, decoder):
+        self.transformer = decoder
+
+    def get_decoder(self):
+        return self.transformer
+
+    # Runs the transformer, projects to vocabulary logits and, when `labels` is
+    # given, computes the next-token cross-entropy loss. Always returns a
+    # CausalLMOutputWithPast (loss is None without labels).
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
+        # Unset flags fall back to the values stored on the config.
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
+        # Tied head: the embedding matrix doubles as the output projection.
+        logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            # Shift targets left by one so position i is scored against token i + 1.
+            # torch.roll without `dims` rolls the flattened tensor; the values that
+            # wrap into the last column are overwritten with -100 (the default
+            # ignore_index of F.cross_entropy) on the next line.
+            labels = torch.roll(labels, shifts=-1)
+            labels[:, -1] = -100
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
+
+    # Applies the initialiser registered under init_config['name'] to `module`,
+    # forwarding the full init_config mapping as keyword arguments.
+    def param_init_fn(self, module):
+        init_fn_name = self.config.init_config['name']
+        MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
+
+    # True only for MPTBlock instances.
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, MPTBlock)
+
+    # True only for MPTBlock instances.
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, MPTBlock)
+
+    # Builds the keyword arguments for one generation step. Requires
+    # kwargs['attention_mask']; raises NotImplementedError for inputs_embeds,
+    # for right-padded batches, and for prefix_lm combined with use_cache=False.
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        if inputs_embeds is not None:
+            raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
+        attention_mask = kwargs['attention_mask'].bool()
+        # Every row must have its final position unmasked, i.e. no right padding.
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError('MPT does not support generation with right padding.')
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        # With a cache present only the most recent token is fed to the model.
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            # The whole attended context is marked as prefix.
+            prefix_mask = torch.ones_like(attention_mask)
+            if kwargs.get('use_cache') == False:
+                raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
+        else:
+            prefix_mask = None
+        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)}
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Used by HuggingFace generate when using beam search with kv-caching.
+
+        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
+        for an example in transformers.
+        """
+        # Re-select dimension 0 of every cached tensor in every layer by beam_idx.
+        return [
+            tuple(
+                (past_state.index_select(0, beam_idx) for past_state in layer_past)
+            )
+            for layer_past in past_key_values
+        ]
\ No newline at end of file
--- a/diffusion/model/llava/mpt/norm.py
+++ b/diffusion/model/llava/mpt/norm.py
+import torch
+
+def _cast_if_autocast_enabled(tensor):
+    """Return ``tensor`` cast to the active autocast dtype for its device.
+
+    The tensor is returned untouched when autocast is not enabled. Devices
+    other than 'cuda' and 'cpu' raise NotImplementedError.
+    """
+    if not torch.is_autocast_enabled():
+        return tensor
+    device_type = tensor.device.type
+    if device_type == 'cuda':
+        target_dtype = torch.get_autocast_gpu_dtype()
+    elif device_type == 'cpu':
+        target_dtype = torch.get_autocast_cpu_dtype()
+    else:
+        raise NotImplementedError()
+    return tensor.to(dtype=target_dtype)
+
+class LPLayerNorm(torch.nn.LayerNorm):
+    """LayerNorm whose input, weight and bias are first cast to the autocast dtype.
+
+    The layer_norm call itself runs with autocast disabled, so it executes in
+    whatever dtype the cast produced.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
+        super().__init__(
+            normalized_shape=normalized_shape,
+            eps=eps,
+            elementwise_affine=elementwise_affine,
+            device=device,
+            dtype=dtype,
+        )
+
+    def forward(self, x):
+        device_type = x.device.type
+        inputs = _cast_if_autocast_enabled(x)
+        # Affine parameters may be absent (None); only cast the ones that exist.
+        weight = self.weight if self.weight is None else _cast_if_autocast_enabled(self.weight)
+        bias = self.bias if self.bias is None else _cast_if_autocast_enabled(self.bias)
+        with torch.autocast(enabled=False, device_type=device_type):
+            return torch.nn.functional.layer_norm(inputs, self.normalized_shape, weight, bias, self.eps)
+
+def rms_norm(x, weight=None, eps=1e-05):
+    """Normalise ``x`` by the root-mean-square of its last dimension.
+
+    Args:
+        x: input tensor; the mean of squares is taken over dim -1.
+        weight: optional tensor multiplied into the normalised output.
+        eps: added to the mean of squares before the reciprocal square root.
+
+    Returns:
+        ``x / sqrt(mean(x**2, -1) + eps)``, times ``weight`` when it is given.
+    """
+    # rsqrt already yields 1 / sqrt(...), so it is multiplied in; dividing by it
+    # would scale x *up* by its RMS instead of normalising it.
+    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+    return output * weight if weight is not None else output
+
+class RMSNorm(torch.nn.Module):
+    """Module wrapper around ``rms_norm`` with an optional learnable weight.
+
+    The computation is done on ``x.float()`` and the result is cast back to
+    the dtype of the input.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
+        super().__init__()
+        self.eps = eps
+        if not weight:
+            # Register an explicit None so `self.weight` still resolves.
+            self.register_parameter('weight', None)
+            return
+        initial_scale = torch.ones(normalized_shape, dtype=dtype, device=device)
+        self.weight = torch.nn.Parameter(initial_scale)
+
+    def forward(self, x):
+        normed = rms_norm(x.float(), self.weight, self.eps)
+        return normed.to(dtype=x.dtype)
+
+class LPRMSNorm(RMSNorm):
+    """RMSNorm variant that casts input and weight to the autocast dtype first.
+
+    ``rms_norm`` is then invoked with autocast disabled and the result is cast
+    back to the dtype of the original input.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
+        super().__init__(
+            normalized_shape=normalized_shape,
+            eps=eps,
+            weight=weight,
+            dtype=dtype,
+            device=device,
+        )
+
+    def forward(self, x):
+        inputs = _cast_if_autocast_enabled(x)
+        scale = self.weight
+        if scale is not None:
+            scale = _cast_if_autocast_enabled(scale)
+        with torch.autocast(enabled=False, device_type=x.device.type):
+            normed = rms_norm(inputs, scale, self.eps)
+            return normed.to(dtype=x.dtype)
+# Maps the norm-type name to the class implementing it.
+NORM_CLASS_REGISTRY = {
+    'layernorm': torch.nn.LayerNorm,
+    'low_precision_layernorm': LPLayerNorm,
+    'rmsnorm': RMSNorm,
+    'low_precision_rmsnorm': LPRMSNorm,
+}
\ No newline at end of file
--- a/diffusion/model/llava/mpt/param_init_fns.py
+++ b/diffusion/model/llava/mpt/param_init_fns.py
+import math
+import warnings
+from collections.abc import Sequence
+from functools import partial
+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+from .norm import NORM_CLASS_REGISTRY
+
+def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
+    """Re-run ``module.reset_parameters()`` when the module defines it.
+
+    Extra keyword arguments are accepted and discarded; modules without a
+    ``reset_parameters`` attribute are left untouched.
+    """
+    del kwargs
+    if verbose > 1:
+        warnings.warn("Initializing network using module's reset_parameters attribute")
+    if not hasattr(module, 'reset_parameters'):
+        return
+    module.reset_parameters()
+
+def fused_init_helper_(module: nn.Module, init_fn_):
+    """Apply ``init_fn_`` separately to each fused slice of ``module.weight``.
+
+    ``module._fused`` must be a ``(dim, splits)`` pair: ``splits`` lists the
+    interior boundaries along ``dim``; 0 and the full size of that dimension
+    are added as the outer boundaries.
+
+    Raises:
+        RuntimeError: if the module carries no ``_fused`` attribute.
+    """
+    _fused = getattr(module, '_fused', None)
+    if _fused is None:
+        raise RuntimeError('Internal logic error')
+    (dim, splits) = _fused
+    splits = (0, *splits, module.weight.size(dim))
+    for (s, e) in zip(splits[:-1], splits[1:]):
+        slice_indices = [slice(None)] * module.weight.ndim
+        slice_indices[dim] = slice(s, e)
+        # Index with a tuple: a list of slices only works through legacy
+        # sequence-as-tuple indexing.
+        init_fn_(module.weight[tuple(slice_indices)])
+
+def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
+    """Initialise a single module in place according to its type.
+
+    Handled module types: ``nn.Linear`` (optionally fused), ``nn.Embedding``,
+    the classes in ``NORM_CLASS_REGISTRY`` and ``nn.MultiheadAttention``.
+    Weights get ``init_fn_``, biases are zeroed, norm weights are set to 1.
+    Layers flagged with ``_is_residual`` are additionally divided by a factor
+    derived from ``init_div_is_residual``.
+
+    Args:
+        module: module to initialise.
+        init_fn_: in-place initialiser applied to weight tensors.
+        n_layers: used for the default residual divisor ``sqrt(2 * n_layers)``.
+        d_model: slice width for a packed ``in_proj_weight``; required for
+            ``nn.MultiheadAttention`` with shared embed dim.
+        init_div_is_residual: False disables the division, True selects the
+            default divisor, a number (or all-digit string) is used as is.
+        emb_init_std: if set, embeddings use normal(0, emb_init_std).
+        emb_init_uniform_lim: if set (and emb_init_std is not), embeddings use
+            uniform(a, b); a scalar ``lim`` means ``(-lim, lim)``.
+        verbose: values above 1 emit informational warnings.
+        **kwargs: accepted and discarded.
+
+    Raises:
+        ValueError: for an unsupported ``init_div_is_residual`` or a uniform
+            limit sequence that does not have exactly two entries.
+        NotImplementedError: for any other module type that directly owns
+            parameters.
+    """
+    del kwargs
+    if verbose > 1:
+        warnings.warn('If model has bias parameters they are initialized to 0.')
+    if init_div_is_residual is False:
+        div_is_residual = 1.0
+    elif init_div_is_residual is True:
+        div_is_residual = math.sqrt(2 * n_layers)
+    elif isinstance(init_div_is_residual, (float, int)):
+        div_is_residual = init_div_is_residual
+    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
+        div_is_residual = float(init_div_is_residual)
+    else:
+        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
+    if init_div_is_residual is not False and verbose > 1:
+        warnings.warn(
+            f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. Set `init_div_is_residual: false` in init config to disable this.'
+        )
+    if isinstance(module, nn.Linear):
+        if hasattr(module, '_fused'):
+            fused_init_helper_(module, init_fn_)
+        else:
+            init_fn_(module.weight)
+        if module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
+            with torch.no_grad():
+                module.weight.div_(div_is_residual)
+    elif isinstance(module, nn.Embedding):
+        if emb_init_std is not None:
+            std = emb_init_std
+            if std == 0:
+                warnings.warn('Embedding layer initialized to 0.')
+            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
+            if verbose > 1:
+                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
+        elif emb_init_uniform_lim is not None:
+            lim = emb_init_uniform_lim
+            if isinstance(lim, Sequence):
+                # Exactly two limits are needed; a shorter sequence would
+                # otherwise fail below with an unrelated IndexError.
+                if len(lim) != 2:
+                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
+                if lim[0] == lim[1]:
+                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
+            else:
+                if lim == 0:
+                    warnings.warn('Embedding layer initialized to 0.')
+                lim = [-lim, lim]
+            (a, b) = lim
+            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
+            if verbose > 1:
+                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
+        else:
+            emb_init_fn_ = init_fn_
+        emb_init_fn_(module.weight)
+    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
+        if verbose > 1:
+            warnings.warn(
+                'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.'
+            )
+        if hasattr(module, 'weight') and module.weight is not None:
+            torch.nn.init.ones_(module.weight)
+        if hasattr(module, 'bias') and module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.MultiheadAttention):
+        if module._qkv_same_embed_dim:
+            # Packed q/k/v weight: each d_model-wide slice is initialised separately.
+            _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_)
+        else:
+            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
+            assert module.in_proj_weight is None
+            init_fn_(module.q_proj_weight)
+            init_fn_(module.k_proj_weight)
+            init_fn_(module.v_proj_weight)
+        if module.in_proj_bias is not None:
+            torch.nn.init.zeros_(module.in_proj_bias)
+        if module.bias_k is not None:
+            torch.nn.init.zeros_(module.bias_k)
+        if module.bias_v is not None:
+            torch.nn.init.zeros_(module.bias_v)
+        init_fn_(module.out_proj.weight)
+        if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
+            with torch.no_grad():
+                module.out_proj.weight.div_(div_is_residual)
+        if module.out_proj.bias is not None:
+            torch.nn.init.zeros_(module.out_proj.bias)
+    else:
+        # Container modules without own parameters pass silently; anything
+        # that directly owns a parameter must be handled explicitly above.
+        for _ in module.parameters(recurse=False):
+            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')
+
+
+# TODO Rename this here and in `generic_param_init_fn_`
+def _extracted_from_generic_param_init_fn__69(module, d_model, init_fn_):
+    """Initialise the three ``d_model``-row slices of a packed ``in_proj_weight``.
+
+    Asserts that the module uses the packed weight (separate q/k/v projection
+    weights must be None) and that ``d_model`` was supplied.
+    """
+    assert module.in_proj_weight is not None
+    assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
+    assert d_model is not None
+    width = d_model
+    for part in range(3):
+        start = part * width
+        init_fn_(module.in_proj_weight[start:start + width])
+
+def _normal_init_(std, mean=0.0):
+    """Return ``torch.nn.init.normal_`` with ``mean`` and ``std`` pre-bound."""
+    bound_normal_ = partial(torch.nn.init.normal_, mean=mean, std=std)
+    return bound_normal_
+
+def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
+    """Run ``generic_param_init_fn_`` with a zero-mean normal initialiser of width ``std``."""
+    del kwargs
+    if verbose > 1:
+        warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=_normal_init_(std=std),
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
+    """Normal initialisation with a user-supplied ``init_std``.
+
+    Raises:
+        ValueError: if ``init_std`` is None.
+    """
+    del kwargs
+    if init_std is None:
+        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
+    _normal_param_init_fn_(
+        module=module,
+        std=init_std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
+    """Normal initialisation with ``std = sqrt(2 / (5 * d_model))``."""
+    del kwargs
+    small_std = math.sqrt(2 / (5 * d_model))
+    _normal_param_init_fn_(
+        module=module,
+        std=small_std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
+    """From section 2.3.1 of GPT-NeoX-20B:
+
+    An Open-Source Autoregressive Language Model — Black et al. (2022)
+    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
+    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
+
+    Delegates to ``small_param_init_fn_`` with the residual divisor fixed to
+    ``n_layers / sqrt(10)``.
+    """
+    del kwargs
+    neox_divisor = n_layers / math.sqrt(10)
+    if verbose > 1:
+        warnings.warn(f'setting init_div_is_residual to {neox_divisor}')
+    small_param_init_fn_(
+        module=module,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=neox_divisor,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
+    """Run ``generic_param_init_fn_`` with ``nn.init.kaiming_uniform_``.
+
+    ``init_gain``, ``fan_mode`` and ``init_nonlinearity`` are bound as the
+    initialiser's ``a``, ``mode`` and ``nonlinearity`` arguments.
+    """
+    del kwargs
+    if verbose > 1:
+        warnings.warn(
+            f'Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
+        )
+    bound_init_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=bound_init_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
+    """Run ``generic_param_init_fn_`` with ``torch.nn.init.kaiming_normal_``.
+
+    ``init_gain``, ``fan_mode`` and ``init_nonlinearity`` are bound as the
+    initialiser's ``a``, ``mode`` and ``nonlinearity`` arguments.
+    """
+    del kwargs
+    if verbose > 1:
+        warnings.warn(
+            f'Using nn.init.kaiming_normal_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
+        )
+    bound_init_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=bound_init_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
+    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_uniform_`` (gain = ``init_gain``)."""
+    del kwargs
+    bound_init_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f'Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}'
+        )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=bound_init_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+
+def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
+    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_normal_`` (gain = ``init_gain``).
+
+    Extra keyword arguments are accepted and discarded.
+    """
+    # Unused extras are dropped explicitly, as in the other init wrappers in this file.
+    del kwargs
+    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f'Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}'
+        )
+    generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
+# Maps the init-scheme name (init_config['name']) to its initialiser function.
+MODEL_INIT_REGISTRY = {
+    'default_': torch_default_param_init_fn_,
+    'baseline_': baseline_param_init_fn_,
+    'kaiming_uniform_': kaiming_uniform_param_init_fn_,
+    'kaiming_normal_': kaiming_normal_param_init_fn_,
+    'neox_init_': neox_param_init_fn_,
+    'small_init_': small_param_init_fn_,
+    'xavier_uniform_': xavier_uniform_param_init_fn_,
+    'xavier_normal_': xavier_normal_param_init_fn_,
+}
\ No newline at end of file
--- a/diffusion/model/nets/PixArt.py
+++ b/diffusion/model/nets/PixArt.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+import math
+import torch
+import torch.nn as nn
+import os
+import numpy as np
+from timm.models.layers import DropPath
+from timm.models.vision_transformer import PatchEmbed, Mlp
+
+from diffusion.model.builder import MODELS
+from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
+from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, LabelEmbedder, FinalLayer
+from diffusion.utils.logger import get_root_logger
+
+
+class PixArtBlock(nn.Module):
+    """
+    One PixArt transformer layer: modulated self-attention, cross-attention
+    onto the conditioning tokens, then a modulated MLP, each added residually.
+    Shift/scale/gate values come from a learned table plus the timestep signal.
+    """
+
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.window_size = window_size
+        # A non-zero window size replaces the attention input size with a square window.
+        attn_input_size = input_size if window_size == 0 else (window_size, window_size)
+        mlp_hidden = int(hidden_size * mlp_ratio)
+
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn = WindowAttention(
+            hidden_size,
+            num_heads=num_heads,
+            qkv_bias=True,
+            input_size=attn_input_size,
+            use_rel_pos=use_rel_pos,
+            **block_kwargs,
+        )
+        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        # to be compatible with lower version pytorch
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, act_layer=approx_gelu, drop=0)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)
+
+    def forward(self, x, y, t, mask=None, **kwargs):
+        batch, tokens, channels = x.shape
+
+        # Six modulation tensors: learned table plus the per-sample timestep term.
+        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)
+
+        attn_in = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
+        attn_out = self.attn(attn_in).reshape(batch, tokens, channels)
+        x = x + self.drop_path(gate_msa * attn_out)
+
+        x = x + self.cross_attn(x, y, mask)
+
+        mlp_in = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
+        x = x + self.drop_path(gate_mlp * self.mlp(mlp_in))
+
+        return x
+
+
+#############################################################################
+#                                 Core PixArt Model                                #
+#################################################################################
+@MODELS.register_module()
+class PixArt(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+
+    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1.0, config=None, model_max_length=120, **kwargs):
+        if window_block_indexes is None:
+            window_block_indexes = []
+        super().__init__()
+        self.pred_sigma = pred_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if pred_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.lewei_scale = lewei_scale,
+
+        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        num_patches = self.x_embedder.num_patches
+        self.base_size = input_size // self.patch_size
+        # Will use fixed sin-cos embedding:
+        self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))
+
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.t_block = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
+        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            PixArtBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
+                          input_size=(input_size // patch_size, input_size // patch_size),
+                          window_size=window_size if i in window_block_indexes else 0,
+                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
+            for i in range(depth)
+        ])
+        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)
+
+        self.initialize_weights()
+
+        if config:
+            logger = get_root_logger(os.path.join(config.work_dir, 'train_log.log'))
+            logger.warning(f"lewei scale: {self.lewei_scale}, base size: {self.base_size}")
+        else:
+            print(f'Warning: lewei scale: {self.lewei_scale}, base size: {self.base_size}')
+
+    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
+        """
+        Forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) tensor of class labels
+        """
+        x = x.to(self.dtype)
+        timestep = timestep.to(self.dtype)
+        y = y.to(self.dtype)
+        pos_embed = self.pos_embed.to(self.dtype)
+        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
+        t0 = self.t_block(t)
+        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
+        if mask is not None:
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
+            y_lens = mask.sum(dim=1).tolist()
+        else:
+            y_lens = [y.shape[2]] * y.shape[0]
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+        for block in self.blocks:
+            x = auto_grad_checkpoint(block, x, y, t0, y_lens)  # (N, T, D) #support grad checkpoint
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x)  # (N, out_channels, H, W)
+        return x
+
+    def forward_with_dpmsolver(self, x, timestep, y, mask=None, **kwargs):
+        """
+        dpm solver donnot need variance prediction
+        """
+        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        model_out = self.forward(x, timestep, y, mask)
+        return model_out.chunk(2, dim=1)[0]
+
+    def forward_with_cfg(self, x, timestep, y, cfg_scale, mask=None, **kwargs):
+        """
+        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        half = x[: len(x) // 2]
+        combined = torch.cat([half, half], dim=0)
+        model_out = self.forward(combined, timestep, y, mask, kwargs)
+        model_out = model_out['x'] if isinstance(model_out, dict) else model_out
+        eps, rest = model_out[:, :3], model_out[:, 3:]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        eps = torch.cat([half_eps, half_eps], dim=0)
+        return torch.cat([eps, rest], dim=1)
+
+    def unpatchify(self, x):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]
+        h = w = int(x.shape[1] ** 0.5)
+        assert h * w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        return x.reshape(shape=(x.shape[0], c, h * p, h * p))
+
+    def initialize_weights(self):
+        """
+        Initialize the model's weights in place.
+
+        Order matters: the generic Xavier/zero initialization of every ``nn.Linear`` runs
+        first, and the more specific initializations below then overwrite it for selected
+        sub-modules.
+        """
+        # Initialize transformer layers:
+        def _basic_init(module):
+            # Xavier-uniform weights and zero bias for every nn.Linear in the model.
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+
+        self.apply(_basic_init)
+
+        # Fill pos_embed with the fixed 2D sin-cos embedding
+        # (whether it is frozen depends on how pos_embed is declared, which is not visible here):
+        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), lewei_scale=self.lewei_scale, base_size=self.base_size)
+        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
+
+        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+        w = self.x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.t_block[1].weight, std=0.02)
+
+        # Initialize caption embedding MLP:
+        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
+        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
+
+        # Zero-out the cross-attention output projection in every PixArt block,
+        # so each block's cross-attention branch initially contributes nothing:
+        for block in self.blocks:
+            nn.init.constant_(block.cross_attn.proj.weight, 0)
+            nn.init.constant_(block.cross_attn.proj.bias, 0)
+
+        # Zero-out output layers:
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+
+    @property
+    def dtype(self):
+        """Data type of the model, taken from its first parameter."""
+        first_param = next(self.parameters())
+        return first_param.dtype
+
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, lewei_scale=1.0, base_size=16):
+    """
+    Build a 2D sine-cosine positional embedding.
+
+    grid_size: int (square grid) or a (height, width) pair
+    lewei_scale, base_size: coordinates are divided by (grid_size / base_size) and by lewei_scale
+    return:
+    pos_embed: [grid_h*grid_w, embed_dim], or [extra_tokens+grid_h*grid_w, embed_dim] with
+    zero rows prepended when cls_token is set and extra_tokens > 0
+    """
+    if isinstance(grid_size, int):
+        grid_size = to_2tuple(grid_size)
+    rows, cols = grid_size[0], grid_size[1]
+    coords_h = np.arange(rows, dtype=np.float32) / (rows/base_size) / lewei_scale
+    coords_w = np.arange(cols, dtype=np.float32) / (cols/base_size) / lewei_scale
+    # here w goes first
+    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0)
+    mesh = mesh.reshape([2, 1, cols, rows])
+
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
+    if not (cls_token and extra_tokens > 0):
+        return pos_embed
+    padding = np.zeros([extra_tokens, embed_dim])
+    return np.concatenate([padding, pos_embed], axis=0)
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Encode a two-plane coordinate grid as a sine-cosine embedding.
+
+    Half of ``embed_dim`` encodes ``grid[0]`` and the other half encodes ``grid[1]``;
+    the two encodings are concatenated along the feature axis.
+    return: (H*W, embed_dim)
+    """
+    assert embed_dim % 2 == 0
+
+    half_dim = embed_dim // 2
+    # one (H*W, D/2) encoding per coordinate plane, in plane order
+    per_axis = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[axis]) for axis in (0, 1)]
+
+    return np.concatenate(per_axis, axis=1)
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sine-cosine embedding of a set of scalar positions.
+
+    embed_dim: output dimension for each position (must be even)
+    pos: array of positions to be encoded; flattened to size (M,)
+    out: (M, D), the sine half followed by the cosine half
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    # frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1
+    omega = np.arange(half_dim, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    flat_pos = pos.reshape(-1)  # (M,)
+    angles = np.einsum('m,d->md', flat_pos, omega)  # (M, D/2), outer product
+
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
+
+
+#################################################################################
+#                                   PixArt Configs                                  #
+#################################################################################
+@MODELS.register_module()
+def PixArt_XL_2(**kwargs):
+    """Build the PixArt XL/2 configuration: depth 28, hidden size 1152, patch size 2, 16 heads."""
+    model = PixArt(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
+    return model
--- a/diffusion/model/nets/PixArtMS.py
+++ b/diffusion/model/nets/PixArtMS.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+from timm.models.layers import DropPath
+from timm.models.vision_transformer import Mlp
+
+from diffusion.model.builder import MODELS
+from diffusion.model.utils import auto_grad_checkpoint, to_2tuple
+from diffusion.model.nets.PixArt_blocks import t2i_modulate, CaptionEmbedder, WindowAttention, MultiHeadCrossAttention, T2IFinalLayer, TimestepEmbedder, SizeEmbedder
+from diffusion.model.nets.PixArt import PixArt, get_2d_sincos_pos_embed
+
+
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding.
+
+    A convolution whose kernel size equals its stride projects every non-overlapping
+    patch to ``embed_dim`` channels; the result is optionally flattened to a token
+    sequence and passed through an optional normalization layer.
+    """
+    def __init__(
+            self,
+            patch_size=16,
+            in_chans=3,
+            embed_dim=768,
+            norm_layer=None,
+            flatten=True,
+            bias=True,
+    ):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+        self.flatten = flatten
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=bias)
+        if norm_layer:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = nn.Identity()
+
+    def forward(self, x):
+        tokens = self.proj(x)
+        if self.flatten:
+            # BCHW -> BNC
+            tokens = tokens.flatten(2).transpose(1, 2)
+        return self.norm(tokens)
+
+
+class PixArtMSBlock(nn.Module):
+    """
+    A PixArt transformer block: modulated self-attention, cross-attention to the
+    condition tokens, and a modulated MLP, each added back residually.
+
+    The six modulation tensors (shift/scale/gate for attention and MLP) are the sum of a
+    learned per-block ``scale_shift_table`` and the timestep conditioning ``t``.
+    """
+
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., window_size=0, input_size=None, use_rel_pos=False, **block_kwargs):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        # A non-zero window size replaces the full input size handed to the attention layer.
+        attn_input_size = input_size if window_size == 0 else (window_size, window_size)
+        self.attn = WindowAttention(hidden_size, num_heads=num_heads, qkv_bias=True,
+                                    input_size=attn_input_size,
+                                    use_rel_pos=use_rel_pos, **block_kwargs)
+        self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads, **block_kwargs)
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        # to be compatible with lower version pytorch
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        mlp_hidden = int(hidden_size * mlp_ratio)
+        self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden, act_layer=approx_gelu, drop=0)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.window_size = window_size
+        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)
+
+    def forward(self, x, y, t, mask=None, **kwargs):
+        batch, _, _ = x.shape
+
+        modulation = self.scale_shift_table[None] + t.reshape(batch, 6, -1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=1)
+
+        attn_in = t2i_modulate(self.norm1(x), shift_msa, scale_msa)
+        x = x + self.drop_path(gate_msa * self.attn(attn_in))
+        x = x + self.cross_attn(x, y, mask)
+        mlp_in = t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)
+        x = x + self.drop_path(gate_mlp * self.mlp(mlp_in))
+
+        return x
+
+
+#############################################################################
+#                                 Core PixArt Model                                #
+#################################################################################
+@MODELS.register_module()
+class PixArtMS(PixArt):
+    """
+    Multi-scale PixArt: a diffusion model with a Transformer backbone.
+
+    Compared with the parent class, ``forward`` recomputes the sin-cos positional embedding
+    from the spatial size of each input and adds embeddings of the image size and aspect
+    ratio (taken from ``data_info``) to the timestep embedding.
+    """
+
+    def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4.0, class_dropout_prob=0.1, learn_sigma=True, pred_sigma=True, drop_path: float = 0., window_size=0, window_block_indexes=None, use_rel_pos=False, caption_channels=4096, lewei_scale=1., config=None, model_max_length=120, **kwargs):
+        if window_block_indexes is None:
+            window_block_indexes = []
+        super().__init__(
+            input_size=input_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            hidden_size=hidden_size,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            class_dropout_prob=class_dropout_prob,
+            learn_sigma=learn_sigma,
+            pred_sigma=pred_sigma,
+            drop_path=drop_path,
+            window_size=window_size,
+            window_block_indexes=window_block_indexes,
+            use_rel_pos=use_rel_pos,
+            lewei_scale=lewei_scale,
+            config=config,
+            model_max_length=model_max_length,
+            **kwargs,
+        )
+        # Token-grid height/width of the most recent input; set in forward(), read by unpatchify().
+        self.h = self.w = 0
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.t_block = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+        )
+        self.x_embedder = PatchEmbed(patch_size, in_channels, hidden_size, bias=True)
+        self.y_embedder = CaptionEmbedder(in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, act_layer=approx_gelu, token_num=model_max_length)
+        self.csize_embedder = SizeEmbedder(hidden_size//3)  # c_size embed
+        self.ar_embedder = SizeEmbedder(hidden_size//3)     # aspect ratio embed
+        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            PixArtMSBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
+                          input_size=(input_size // patch_size, input_size // patch_size),
+                          window_size=window_size if i in window_block_indexes else 0,
+                          use_rel_pos=use_rel_pos if i in window_block_indexes else False)
+            for i in range(depth)
+        ])
+        self.final_layer = T2IFinalLayer(hidden_size, patch_size, self.out_channels)
+
+        self.initialize()
+
+    def forward(self, x, timestep, y, mask=None, data_info=None, **kwargs):
+        """
+        Forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        timestep: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) tensor of caption features
+        mask: optional mask over the caption tokens; non-zero entries mark tokens that are kept
+        data_info: dict providing 'img_hw' and 'aspect_ratio' tensors; required in practice
+            although it defaults to None
+        """
+        bs = x.shape[0]
+        x = x.to(self.dtype)
+        timestep = timestep.to(self.dtype)
+        y = y.to(self.dtype)
+        c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype)
+        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
+        # The positional embedding is rebuilt for the current token grid on every call.
+        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep)  # (N, D)
+        csize = self.csize_embedder(c_size, bs)  # (N, D)
+        ar = self.ar_embedder(ar, bs)  # (N, D)
+        t = t + torch.cat([csize, ar], dim=1)
+        t0 = self.t_block(t)
+        y = self.y_embedder(y, self.training)  # (N, D)
+        if mask is not None:
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            # Keep only unmasked caption tokens, packed into one sequence; y_lens records
+            # how many tokens each sample contributed.
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
+            y_lens = mask.sum(dim=1).tolist()
+        else:
+            y_lens = [y.shape[2]] * y.shape[0]
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+        for block in self.blocks:
+            x = auto_grad_checkpoint(block, x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x)  # (N, out_channels, H, W)
+        return x
+
+    def forward_with_dpmsolver(self, x, timestep, y, data_info, **kwargs):
+        """
+        Forward pass for DPM-Solver sampling, which does not need the variance prediction:
+        only the first of the two chunks of the output along dim 1 is returned.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        model_out = self.forward(x, timestep, y, data_info=data_info, **kwargs)
+        return model_out.chunk(2, dim=1)[0]
+
+    def forward_with_cfg(self, x, timestep, y, cfg_scale, data_info, **kwargs):
+        """
+        Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance.
+
+        Extra keyword arguments (e.g. ``mask``) are forwarded to ``forward``, as in
+        ``forward_with_dpmsolver``.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
+        half = x[: len(x) // 2]
+        combined = torch.cat([half, half], dim=0)
+        # Forward **kwargs so that a caption mask supplied by the sampler is not silently dropped.
+        model_out = self.forward(combined, timestep, y, data_info=data_info, **kwargs)
+        # NOTE(review): guidance is applied to the first 3 channels only (hard-coded).
+        eps, rest = model_out[:, :3], model_out[:, 3:]
+        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+        eps = torch.cat([half_eps, half_eps], dim=0)
+        return torch.cat([eps, rest], dim=1)
+
+    def unpatchify(self, x):
+        """
+        x: (N, T, patch_size**2 * C), with T == self.h * self.w as set by the last forward()
+        return: (N, C, self.h * patch_size, self.w * patch_size)
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]
+        assert self.h * self.w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        return x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))
+
+    def initialize(self):
+        """
+        Initialize the weights in place; the generic nn.Linear initialization runs first and
+        the more specific initializations below overwrite it for selected sub-modules.
+        """
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+
+        self.apply(_basic_init)
+
+        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+        w = self.x_embedder.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.t_block[1].weight, std=0.02)
+        nn.init.normal_(self.csize_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.csize_embedder.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.ar_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.ar_embedder.mlp[2].weight, std=0.02)
+
+        # Initialize caption embedding MLP:
+        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
+        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)
+
+        # Zero-out the cross-attention output projection in every PixArt block:
+        for block in self.blocks:
+            nn.init.constant_(block.cross_attn.proj.weight, 0)
+            nn.init.constant_(block.cross_attn.proj.bias, 0)
+
+        # Zero-out output layers:
+        nn.init.constant_(self.final_layer.linear.weight, 0)
+        nn.init.constant_(self.final_layer.linear.bias, 0)
+
+
+#################################################################################
+#                                   PixArt Configs                                  #
+#################################################################################
+@MODELS.register_module()
+def PixArtMS_XL_2(**kwargs):
+    return PixArtMS(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
--- a/diffusion/model/nets/PixArt_blocks.py
+++ b/diffusion/model/nets/PixArt_blocks.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+import math
+import torch
+import torch.nn as nn
+from timm.models.vision_transformer import Mlp, Attention as Attention_
+from einops import rearrange, repeat
+import xformers.ops
+
+from diffusion.model.utils import add_decomposed_rel_pos
+
+
+def modulate(x, shift, scale):
+    """Scale then shift ``x``; ``shift`` and ``scale`` get a new axis at dim 1 so they broadcast over it."""
+    scale = scale.unsqueeze(1)
+    shift = shift.unsqueeze(1)
+    return x * (1 + scale) + shift
+
+
+def t2i_modulate(x, shift, scale):
+    """Scale then shift ``x``; ``shift`` and ``scale`` are used as given (plain broadcasting)."""
+    scaled = x * (1 + scale)
+    return scaled + shift
+
+
+class MultiHeadCrossAttention(nn.Module):
+    """
+    Multi-head cross-attention: queries come from ``x``, keys and values from ``cond``.
+
+    All samples are packed into a single sequence (batch dimension 1) before calling the
+    xformers memory-efficient attention kernel; when ``mask`` is given, a block-diagonal
+    bias keeps each sample attending only to its own condition tokens.
+    """
+    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., **block_kwargs):
+        super().__init__()
+        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.head_dim = d_model // num_heads
+
+        self.q_linear = nn.Linear(d_model, d_model)
+        self.kv_linear = nn.Linear(d_model, d_model*2)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(d_model, d_model)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, cond, mask=None):
+        # x: (B, N, C) query tokens; cond: condition tokens providing keys and values.
+        # mask: despite its name it is used as the list of per-sample key/value sequence lengths.
+        batch, tokens, channels = x.shape
+        heads, head_dim = self.num_heads, self.head_dim
+
+        query = self.q_linear(x).view(1, -1, heads, head_dim)
+        key, value = self.kv_linear(cond).view(1, -1, 2, heads, head_dim).unbind(2)
+        if mask is None:
+            attn_bias = None
+        else:
+            attn_bias = xformers.ops.fmha.BlockDiagonalMask.from_seqlens([tokens] * batch, mask)
+        # The Triton flash-attention op is selected explicitly;
+        # xformers.ops.fmha.MemoryEfficientAttentionFlashAttentionOp was the op used previously.
+        attended = xformers.ops.memory_efficient_attention(query, key, value, p=self.attn_drop.p, attn_bias=attn_bias, op=xformers.ops.TritonFlashAttentionOp)
+
+        attended = attended.view(batch, -1, channels)
+        return self.proj_drop(self.proj(attended))
+
+
+class WindowAttention(Attention_):
+    """Multi-head self-attention block with optional relative position parameters."""
+
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=True,
+        use_rel_pos=False,
+        rel_pos_zero_init=True,
+        input_size=None,
+        **block_kwargs,
+    ):
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            use_rel_pos (bool): If True, create relative positional parameters.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (tuple or None): Input resolution (height, width) used to size the
+                relative positional parameters.
+        """
+        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, **block_kwargs)
+
+        self.use_rel_pos = use_rel_pos
+        if not self.use_rel_pos:
+            return
+        # initialize relative positional embeddings
+        height, width = input_size[0], input_size[1]
+        self.rel_pos_h = nn.Parameter(torch.zeros(2 * height - 1, self.head_dim))
+        self.rel_pos_w = nn.Parameter(torch.zeros(2 * width - 1, self.head_dim))
+
+        if not rel_pos_zero_init:
+            nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
+            nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
+
+    def forward(self, x, mask=None):
+        B, N, C = x.shape
+        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).unbind(2)
+        # Optional attribute set from outside: run attention in fp32.
+        if getattr(self, 'fp32_attention', False):
+            q, k, v = q.float(), k.float(), v.float()
+
+        if mask is None:
+            attn_bias = None
+        else:
+            # Additive bias: -inf wherever the mask is zero, 0 elsewhere.
+            attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
+            attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
+
+        # The Triton flash-attention op is selected explicitly;
+        # xformers.ops.fmha.MemoryEfficientAttentionFlashAttentionOp was the op used previously.
+        out = xformers.ops.memory_efficient_attention(q, k, v, p=self.attn_drop.p, attn_bias=attn_bias, op=xformers.ops.TritonFlashAttentionOp)
+
+        out = out.view(B, N, C)
+        return self.proj_drop(self.proj(out))
+
+
+#################################################################################
+#   AMP attention with fp32 softmax to fix loss NaN problem during training     #
+#################################################################################
+class Attention(Attention_):
+    """Attention whose score and softmax computation can be run in fp32 outside autocast."""
+
+    def forward(self, x):
+        B, N, C = x.shape
+        head_dim = C // self.num_heads
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+        # Optional attribute set from outside: compute scores and softmax in fp32.
+        fp32_attention = getattr(self, 'fp32_attention', False)
+        if fp32_attention:
+            q = q.float()
+            k = k.float()
+        # Autocast is disabled for this region when fp32 attention is requested.
+        with torch.cuda.amp.autocast(enabled=not fp32_attention):
+            scores = (q @ k.transpose(-2, -1)) * self.scale
+            weights = scores.softmax(dim=-1)
+
+        weights = self.attn_drop(weights)
+
+        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
+        return self.proj_drop(self.proj(out))
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of PixArt: a non-affine LayerNorm modulated by a shift/scale computed
+    from the conditioning vector, followed by a linear projection to patch pixels.
+    """
+
+    def __init__(self, hidden_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        patch_dim = patch_size * patch_size * out_channels
+        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+
+    def forward(self, x, c):
+        # The conditioning is projected to 2 * hidden_size and split into shift and scale.
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        normed = self.norm_final(x)
+        return self.linear(modulate(normed, shift, scale))
+
+
+class T2IFinalLayer(nn.Module):
+    """
+    The final layer of PixArt: shift and scale come from a learned table added to the
+    timestep embedding, rather than from a separate modulation MLP.
+    """
+
+    def __init__(self, hidden_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        patch_dim = patch_size * patch_size * out_channels
+        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
+        self.out_channels = out_channels
+
+    def forward(self, x, t):
+        # (1, 2, D) table + (N, 1, D) timestep embedding -> (N, 2, D), split into shift and scale.
+        modulation = self.scale_shift_table[None] + t[:, None]
+        shift, scale = modulation.chunk(2, dim=1)
+        normed = self.norm_final(x)
+        return self.linear(t2i_modulate(normed, shift, scale))
+
+
+class MaskFinalLayer(nn.Module):
+    """
+    A final layer whose conditioning width (``c_emb_size``) may differ from the width of
+    the tokens it normalizes and projects (``final_hidden_size``).
+    """
+
+    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
+        patch_dim = patch_size * patch_size * out_channels
+        self.linear = nn.Linear(final_hidden_size, patch_dim, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(c_emb_size, 2 * final_hidden_size, bias=True)
+        )
+
+    def forward(self, x, t):
+        # The conditioning is projected to 2 * final_hidden_size and split into shift and scale.
+        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
+        normed = self.norm_final(x)
+        return self.linear(modulate(normed, shift, scale))
+
+
+class DecoderLayer(nn.Module):
+    """
+    Modulated LayerNorm followed by a linear projection from ``hidden_size`` to
+    ``decoder_hidden_size``.
+    """
+
+    def __init__(self, hidden_size, decoder_hidden_size):
+        super().__init__()
+        self.norm_decoder = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, decoder_hidden_size, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+
+    def forward(self, x, t):
+        # The conditioning is projected to 2 * hidden_size and split into shift and scale.
+        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
+        normed = self.norm_decoder(x)
+        return self.linear(modulate(normed, shift, scale))
+
+
+#################################################################################
+#               Embedding Layers for Timesteps and Class Labels                 #
+#################################################################################
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations: a sinusoidal frequency
+    embedding followed by a two-layer MLP.
+    """
+
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings (cosine half first, then sine half).
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        exponents = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
+        freqs = torch.exp(-math.log(max_period) * exponents / half)
+        args = t[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            # Odd output size: pad with one zero column.
+            zero_column = torch.zeros_like(embedding[:, :1])
+            embedding = torch.cat([embedding, zero_column], dim=-1)
+        return embedding
+
+    def forward(self, t):
+        frequencies = self.timestep_embedding(t, self.frequency_embedding_size)
+        return self.mlp(frequencies.to(self.dtype))
+
+    @property
+    def dtype(self):
+        # Return the data type of the module's parameters.
+        first_param = next(self.parameters())
+        return first_param.dtype
+
+
+class SizeEmbedder(TimestepEmbedder):
+    """
+    Embeds one or more scalar size values per sample (e.g. height/width or aspect ratio)
+    into vector representations; each scalar is embedded separately and the per-scalar
+    embeddings of a sample are concatenated.
+    """
+
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
+        # The MLP built by the parent constructor is replaced by a new one of the same shape.
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+        self.outdim = hidden_size
+
+    def forward(self, s, bs):
+        # Accept (B,) or (B, dims); a 1-D input is treated as one scalar per sample.
+        if s.ndim == 1:
+            s = s[:, None]
+        assert s.ndim == 2
+        if s.shape[0] != bs:
+            # Tile the sizes along the batch axis to reach the requested batch size.
+            s = s.repeat(bs//s.shape[0], 1)
+            assert s.shape[0] == bs
+        batch, dims = s.shape[0], s.shape[1]
+        flat_sizes = rearrange(s, "b d -> (b d)")
+        frequencies = self.timestep_embedding(flat_sizes, self.frequency_embedding_size).to(self.dtype)
+        embedded = self.mlp(frequencies)
+        return rearrange(embedded, "(b d) d2 -> b (d d2)", b=batch, d=dims, d2=self.outdim)
+
+    @property
+    def dtype(self):
+        # Return the data type of the module's parameters.
+        first_param = next(self.parameters())
+        return first_param.dtype
+
+
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+
+    When ``dropout_prob > 0`` the embedding table gets one extra row (index ``num_classes``)
+    that stands for the dropped ("unconditional") label.
+    """
+
+    def __init__(self, num_classes, hidden_size, dropout_prob):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+
+        Each label is replaced by ``num_classes`` with probability ``dropout_prob``, or
+        exactly where ``force_drop_ids == 1`` when ``force_drop_ids`` is given.
+        """
+        if force_drop_ids is None:
+            # Sample on the default generator as before, then move the mask to the labels'
+            # device rather than unconditionally to CUDA, so CPU and multi-GPU runs work too.
+            drop_ids = torch.rand(labels.shape[0]).to(labels.device) < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        return self.embedding_table(labels)
+
+
+class CaptionEmbedder(nn.Module):
+    """
+    Projects caption features to the model width with an MLP. Also handles caption dropout
+    for classifier-free guidance by swapping dropped captions for the ``y_embedding`` buffer.
+    """
+
+    # NOTE(review): the default ``act_layer`` is a module instance, whereas the caller visible
+    # in this file passes a zero-argument factory -- confirm which form ``Mlp`` expects.
+    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
+        super().__init__()
+        self.y_proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
+        # Registered as a buffer (although wrapped in nn.Parameter): it is stored with the
+        # module state but is not returned by ``parameters()``.
+        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
+        self.uncond_prob = uncond_prob
+
+    def token_drop(self, caption, force_drop_ids=None):
+        """
+        Drops captions to enable classifier-free guidance.
+
+        Each sample's caption is replaced by ``y_embedding`` with probability ``uncond_prob``,
+        or exactly where ``force_drop_ids == 1`` when ``force_drop_ids`` is given.
+        """
+        if force_drop_ids is None:
+            # Sample on the default generator as before, then move the mask to the caption's
+            # device rather than unconditionally to CUDA, so CPU and multi-GPU runs work too.
+            drop_ids = torch.rand(caption.shape[0]).to(caption.device) < self.uncond_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
+        return caption
+
+    def forward(self, caption, train, force_drop_ids=None):
+        if train:
+            # The trailing (token, channel) dims must match the unconditional embedding.
+            assert caption.shape[2:] == self.y_embedding.shape
+        use_dropout = self.uncond_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            caption = self.token_drop(caption, force_drop_ids)
+        caption = self.y_proj(caption)
+        return caption
+
+
+class CaptionEmbedderDoubleBr(nn.Module):
+    """
+    Two-branch caption embedder: returns an MLP projection of the token-averaged ("global")
+    caption together with the per-token caption. Also handles caption dropout for
+    classifier-free guidance on both branches.
+    """
+
+    # NOTE(review): the default ``act_layer`` is a module instance -- confirm that ``Mlp``
+    # accepts an instance rather than a zero-argument factory.
+    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120):
+        super().__init__()
+        self.proj = Mlp(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer, drop=0)
+        # Learned replacements used when a caption is dropped: one for the global branch,
+        # one for the per-token branch.
+        self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5)
+        self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5)
+        self.uncond_prob = uncond_prob
+
+    def token_drop(self, global_caption, caption, force_drop_ids=None):
+        """
+        Drops captions to enable classifier-free guidance.
+
+        The same per-sample decision is applied to both branches: dropped samples get
+        ``embedding`` as global caption and ``y_embedding`` as per-token caption.
+        """
+        if force_drop_ids is None:
+            # Sample on the default generator as before, then move the mask to the caption's
+            # device rather than unconditionally to CUDA, so CPU and multi-GPU runs work too.
+            drop_ids = torch.rand(global_caption.shape[0]).to(global_caption.device) < self.uncond_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption)
+        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
+        return global_caption, caption
+
+    def forward(self, caption, train, force_drop_ids=None):
+        assert caption.shape[2: ] == self.y_embedding.shape
+        # Average over the token axis, then drop only the singleton axis at dim 1; a bare
+        # squeeze() would also remove the batch axis when the batch size is 1.
+        # Assumes caption is (N, 1, token_num, in_channels) -- TODO confirm against callers.
+        global_caption = caption.mean(dim=2).squeeze(1)
+        use_dropout = self.uncond_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids)
+        y_embed = self.proj(global_caption)
+        return y_embed, caption
--- a/diffusion/model/nets/__init__.py
+++ b/diffusion/model/nets/__init__.py
+from .PixArt import PixArt, PixArt_XL_2
+from .PixArtMS import PixArtMS, PixArtMS_XL_2, PixArtMSBlock
+from .pixart_controlnet import ControlPixArtHalf, ControlPixArtMSHalf
\ No newline at end of file
--- a/diffusion/model/nets/pixart_controlnet.py
+++ b/diffusion/model/nets/pixart_controlnet.py
+import re
+import torch
+import torch.nn as nn
+
+from copy import deepcopy
+from torch import Tensor
+from torch.nn import Module, Linear, init
+from typing import Any, Mapping
+
+from diffusion.model.nets import PixArtMSBlock, PixArtMS, PixArt
+from diffusion.model.nets.PixArt import get_2d_sincos_pos_embed
+from diffusion.model.utils import auto_grad_checkpoint
+
+
+# The implementation of the ControlNet-Half architecture
+# https://github.com/lllyasviel/ControlNet/discussions/188
+class ControlT2IDitBlockHalf(Module):
+    def __init__(self, base_block: PixArtMSBlock, block_index: 0) -> None:
+        super().__init__()
+        self.copied_block = deepcopy(base_block)
+        self.block_index = block_index
+
+        for p in self.copied_block.parameters():
+            p.requires_grad_(True)
+
+        self.copied_block.load_state_dict(base_block.state_dict())
+        self.copied_block.train()
+        
+        self.hidden_size = hidden_size = base_block.hidden_size
+        if self.block_index == 0:
+            self.before_proj = Linear(hidden_size, hidden_size)
+            init.zeros_(self.before_proj.weight)
+            init.zeros_(self.before_proj.bias)
+        self.after_proj = Linear(hidden_size, hidden_size) 
+        init.zeros_(self.after_proj.weight)
+        init.zeros_(self.after_proj.bias)
+
+    def forward(self, x, y, t, mask=None, c=None):
+        
+        if self.block_index == 0:
+            # the first block
+            c = self.before_proj(c)
+            c = self.copied_block(x + c, y, t, mask)
+            c_skip = self.after_proj(c)
+        else:
+            # load from previous c and produce the c for skip connection
+            c = self.copied_block(c, y, t, mask)
+            c_skip = self.after_proj(c)
+        
+        return c, c_skip
+        
+
+# The implementation of ControlPixArtHalf net
+class ControlPixArtHalf(Module):
+    """
+    Frozen base model plus a trainable control branch.
+
+    The base model is switched to eval mode and all of its parameters have ``requires_grad``
+    disabled. ``controlnet`` holds ``copy_blocks_num`` ``ControlT2IDitBlockHalf`` modules built
+    from the first ``copy_blocks_num`` base blocks. Attributes not found on this wrapper are
+    looked up on ``base_model`` (see ``__getattr__``).
+    """
+    # only support single res model
+    def __init__(self, base_model: PixArt, copy_blocks_num: int = 13) -> None:
+        super().__init__()
+        self.base_model = base_model.eval()
+        self.controlnet = []
+        self.copy_blocks_num = copy_blocks_num
+        self.total_blocks_num = len(base_model.blocks)
+        # Freeze the base model; only the control branch keeps trainable parameters.
+        for p in self.base_model.parameters():
+            p.requires_grad_(False)
+
+        # Copy first copy_blocks_num block
+        for i in range(copy_blocks_num):
+            self.controlnet.append(ControlT2IDitBlockHalf(base_model.blocks[i], i))
+        self.controlnet = nn.ModuleList(self.controlnet)
+
+    def __getattr__(self, name: str) -> Tensor or Module:
+        # Python calls __getattr__ only after normal attribute lookup has failed.
+        # NOTE(review): the names in the first branch are methods defined on this class, so normal
+        # lookup should already find them; if this branch is ever reached the instance __dict__
+        # lookup would raise KeyError rather than AttributeError — confirm intent.
+        # NOTE(review): the annotation `Tensor or Module` evaluates to `Tensor` at runtime.
+        if name in ['forward', 'forward_with_dpmsolver', 'forward_with_cfg', 'forward_c', 'load_state_dict']:
+            return self.__dict__[name]
+        elif name in ['base_model', 'controlnet']:
+            # Registered sub-modules are resolved by nn.Module's own __getattr__.
+            return super().__getattr__(name)
+        else:
+            # Everything else (e.g. patch_size, x_embedder, pos_embed) is delegated to the base model.
+            return getattr(self.base_model, name)
+
+    def forward_c(self, c):
+        """
+        Embed the control input with the base model's patch embedder and add a sin-cos positional
+        embedding computed for the control input's own spatial size.
+
+        Side effect: overwrites ``self.h`` and ``self.w`` with the control input's patch-grid size.
+        """
+        # NOTE(review): c.shape is read here, before the `c is not None` check on the return line,
+        # so a None input fails on this line and the `else c` arm cannot be reached with None.
+        self.h, self.w = c.shape[-2]//self.patch_size, c.shape[-1]//self.patch_size
+        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(c.device).to(self.dtype)
+        return self.x_embedder(c) + pos_embed if c is not None else c
+
+    # def forward(self, x, t, c, **kwargs):
+    #     return self.base_model(x, t, c=self.forward_c(c), **kwargs)
+    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
+        """
+        Run the base model, adding control-branch skip outputs to the inputs of base blocks
+        1..copy_blocks_num when a control input ``c`` is given.
+
+        ``data_info`` is accepted but not read in this method. ``kwargs`` are forwarded to every
+        ``auto_grad_checkpoint`` call.
+        """
+        # modify the original PixArtMS forward function
+        if c is not None:
+            c = c.to(self.dtype)
+            c = self.forward_c(c)
+        """
+        Forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) tensor of class labels
+        """
+        x = x.to(self.dtype)
+        timestep = timestep.to(self.dtype)
+        y = y.to(self.dtype)
+        pos_embed = self.pos_embed.to(self.dtype)
+        # Patch-grid size of x; unpatchify() reads these at the end of the pass.
+        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep.to(x.dtype))  # (N, D)
+        t0 = self.t_block(t)
+        y = self.y_embedder(y, self.training)  # (N, 1, L, D)
+        if mask is not None:
+            # Repeat the mask along the batch axis when y has more samples than the mask.
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            # Keep only the unmasked tokens, packed into a single sequence; y_lens records how
+            # many tokens each sample contributed.
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
+            y_lens = mask.sum(dim=1).tolist()
+        else:
+            y_lens = [y.shape[2]] * y.shape[0]
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+
+        # define the first layer
+        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
+
+        if c is not None:
+            # update c
+            # Control block index-1 produces the skip that is added to the input of base block index.
+            for index in range(1, self.copy_blocks_num + 1):
+                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs)
+
+            # update x
+            for index in range(self.copy_blocks_num + 1, self.total_blocks_num):
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
+        else:
+            # Without a control input this is a plain pass through the remaining base blocks.
+            for index in range(1, self.total_blocks_num):
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
+
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x)  # (N, out_channels, H, W)
+        return x
+
+    def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs):
+        """Run ``forward`` and return the first of two equal chunks of the output along dim 1."""
+        model_out = self.forward(x, t, y, data_info=data_info, c=c, **kwargs)
+        return model_out.chunk(2, dim=1)[0]
+
+    # def forward_with_dpmsolver(self, x, t, y, data_info, c, **kwargs):
+    #     return self.base_model.forward_with_dpmsolver(x, t, y, data_info=data_info, c=self.forward_c(c), **kwargs)
+
+    def forward_with_cfg(self, x, t, y, cfg_scale, data_info, c, **kwargs):
+        """Delegate to the base model's ``forward_with_cfg``, passing the embedded control input as ``c``."""
+        # NOTE(review): this calls the base model directly, so the control blocks in
+        # self.controlnet are not run on this path — confirm that is intended.
+        return self.base_model.forward_with_cfg(x, t, y, cfg_scale, data_info, c=self.forward_c(c), **kwargs)
+
+    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
+        """
+        Load either a full wrapper checkpoint or a base-model checkpoint.
+
+        If every key starts with ``base_model`` or ``controlnet`` the dict is loaded into this
+        wrapper. Otherwise keys matching ``blocks.<n>...`` are renamed to
+        ``blocks.<n>.base_block...`` and the dict is loaded into ``self.base_model``.
+        The passed ``state_dict`` is modified in place by the renaming.
+        """
+        if all((k.startswith('base_model') or k.startswith('controlnet')) for k in state_dict.keys()):
+            return super().load_state_dict(state_dict, strict)
+        else:
+            # Build the old-name -> new-name mapping first so the dict is not mutated while its
+            # keys are being iterated.
+            new_key = {}
+            for k in state_dict.keys():
+                new_key[k] = re.sub(r"(blocks\.\d+)(.*)", r"\1.base_block\2", k)
+            for k, v in new_key.items():
+                if k != v:
+                    print(f"replace {k} to {v}")
+                    state_dict[v] = state_dict.pop(k)
+
+            return self.base_model.load_state_dict(state_dict, strict)
+
+    def unpatchify(self, x):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, C, h * patch_size, w * patch_size)
+
+        Uses the patch-grid size stored in ``self.h`` / ``self.w`` by the preceding forward pass.
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]
+        assert self.h * self.w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], self.h, self.w, p, p, c))
+        # Move channels first and interleave each patch's rows/columns with the grid axes.
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        imgs = x.reshape(shape=(x.shape[0], c, self.h * p, self.w * p))
+        return imgs
+
+    @property
+    def dtype(self):
+        # Return the data type of the model parameters (taken from the first parameter).
+        return next(self.parameters()).dtype
+
+
+# The implementation for PixArtMS_Half + 1024 resolution
+class ControlPixArtMSHalf(ControlPixArtHalf):
+    # support multi-scale res model (multi-scale model can also be applied to single reso training & inference)
+    def __init__(self, base_model: PixArtMS, copy_blocks_num: int = 13) -> None:
+        super().__init__(base_model=base_model, copy_blocks_num=copy_blocks_num)
+
+    def forward(self, x, timestep, y, mask=None, data_info=None, c=None, **kwargs):
+        # modify the original PixArtMS forward function
+        """
+        Forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) tensor of class labels
+        """
+        if c is not None:
+            c = c.to(self.dtype)
+            c = self.forward_c(c)
+        bs = x.shape[0]
+        x = x.to(self.dtype)
+        timestep = timestep.to(self.dtype)
+        y = y.to(self.dtype)
+        c_size, ar = data_info['img_hw'].to(self.dtype), data_info['aspect_ratio'].to(self.dtype)
+        self.h, self.w = x.shape[-2]//self.patch_size, x.shape[-1]//self.patch_size
+
+        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(x.device).to(self.dtype)
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep)  # (N, D)
+        csize = self.csize_embedder(c_size, bs)  # (N, D)
+        ar = self.ar_embedder(ar, bs)  # (N, D)
+        t = t + torch.cat([csize, ar], dim=1)
+        t0 = self.t_block(t)
+        y = self.y_embedder(y, self.training)  # (N, D)
+        if mask is not None:
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
+            y_lens = mask.sum(dim=1).tolist()
+        else:
+            y_lens = [y.shape[2]] * y.shape[0]
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+
+        # define the first layer
+        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, **kwargs)  # (N, T, D) #support grad checkpoint
+
+        if c is not None:
+            # update c
+            for index in range(1, self.copy_blocks_num + 1):
+                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, **kwargs)
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, **kwargs)
+        
+            # update x
+            for index in range(self.copy_blocks_num + 1, self.total_blocks_num):
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
+        else:
+            for index in range(1, self.total_blocks_num):
+                x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, **kwargs)
+
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x)  # (N, out_channels, H, W)
+        return x
--- a/diffusion/model/respace.py
+++ b/diffusion/model/respace.py
+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+import numpy as np
+import torch as th
+
+from .gaussian_diffusion import GaussianDiffusion
+
+
+def space_timesteps(num_timesteps, section_counts):
+    """
+    Create a list of timesteps to use from an original diffusion process,
+    given the number of timesteps we want to take from equally-sized portions
+    of the original process.
+    For example, if there's 300 timesteps and the section counts are [10,15,20]
+    then the first 100 timesteps are strided to be 10 timesteps, the second 100
+    are strided to be 15 timesteps, and the final 100 are strided to be 20.
+    If the stride is a string starting with "ddim", then the fixed striding
+    from the DDIM paper is used, and only one section is allowed.
+    :param num_timesteps: the number of diffusion steps in the original
+                          process to divide up.
+    :param section_counts: either a list of numbers, or a string containing
+                           comma-separated numbers, indicating the step count
+                           per section. As a special case, use "ddimN" where N
+                           is a number of steps to use the striding from the
+                           DDIM paper.
+    :return: a set of diffusion steps from the original process to use.
+    """
+    if isinstance(section_counts, str):
+        if section_counts.startswith("ddim"):
+            desired_count = int(section_counts[len("ddim") :])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(
+                f"cannot create exactly {num_timesteps} steps with an integer stride"
+            )
+        section_counts = [int(x) for x in section_counts.split(",")]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(
+                f"cannot divide section of {size} steps into {section_count}"
+            )
+        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        taken_steps = []
+        for _ in range(section_count):
+            taken_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        all_steps += taken_steps
+        start_idx += size
+    return set(all_steps)
+
+
+class SpacedDiffusion(GaussianDiffusion):
+    """
+    A diffusion process which can skip steps in a base diffusion process.
+    :param use_timesteps: a collection (sequence or set) of timesteps from the
+                          original diffusion process to retain.
+    :param kwargs: the kwargs to create the base diffusion process.
+    """
+
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs["betas"])
+
+        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs["betas"] = np.array(new_betas)
+        super().__init__(**kwargs)
+
+    def p_mean_variance(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+
+    def training_losses(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().training_losses(self._wrap_model(model), *args, **kwargs)
+
+    def training_losses_diffusers(
+        self, model, *args, **kwargs
+    ):  # pylint: disable=signature-differs
+        return super().training_losses_diffusers(self._wrap_model(model), *args, **kwargs)
+
+    def condition_mean(self, cond_fn, *args, **kwargs):
+        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
+
+    def condition_score(self, cond_fn, *args, **kwargs):
+        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
+
+    def _wrap_model(self, model):
+        if isinstance(model, _WrappedModel):
+            return model
+        return _WrappedModel(
+            model, self.timestep_map, self.original_num_steps
+        )
+
+    def _scale_timesteps(self, t):
+        # Scaling is done by the wrapped model.
+        return t
+
+
+class _WrappedModel:
+    def __init__(self, model, timestep_map, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        # self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+
+    def __call__(self, x, timestep, **kwargs):
+        map_tensor = th.tensor(self.timestep_map, device=timestep.device, dtype=timestep.dtype)
+        new_ts = map_tensor[timestep]
+        # if self.rescale_timesteps:
+        #     new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, timestep=new_ts, **kwargs)
--- a/diffusion/model/sa_solver.py
+++ b/diffusion/model/sa_solver.py
+import torch
+import torch.nn.functional as F
+import math
+from tqdm import tqdm
+
+
+class NoiseScheduleVP:
+    def __init__(
+            self,
+            schedule='discrete',
+            betas=None,
+            alphas_cumprod=None,
+            continuous_beta_0=0.1,
+            continuous_beta_1=20.,
+            dtype=torch.float32,
+    ):
+        """Thanks to DPM-Solver for their code base"""
+        """Create a wrapper class for the forward SDE (VP type).
+        ***
+        Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t.
+                We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
+        ***
+        The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
+        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
+        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
+            log_alpha_t = self.marginal_log_mean_coeff(t)
+            sigma_t = self.marginal_std(t)
+            lambda_t = self.marginal_lambda(t)
+        Moreover, as lambda(t) is an invertible function, we also support its inverse function:
+            t = self.inverse_lambda(lambda_t)
+        ===============================================================
+        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
+        1. For discrete-time DPMs:
+            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
+                t_i = (i + 1) / N
+            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
+            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
+            Args:
+                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
+                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
+            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
+            **Important**:  Please pay special attention for the args for `alphas_cumprod`:
+                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
+                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
+                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
+                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
+                and
+                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
+        2. For continuous-time DPMs:
+            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
+            schedule are the default settings in DDPM and improved-DDPM:
+            Args:
+                beta_min: A `float` number. The smallest beta for the linear schedule.
+                beta_max: A `float` number. The largest beta for the linear schedule.
+                cosine_s: A `float` number. The hyperparameter in the cosine schedule.
+                cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
+                T: A `float` number. The ending time of the forward process.
+        ===============================================================
+        Args:
+            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
+                    'linear' or 'cosine' for continuous-time DPMs.
+        Returns:
+            A wrapper object of the forward SDE (VP type).
+
+        ===============================================================
+        Example:
+        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
+        >>> ns = NoiseScheduleVP('discrete', betas=betas)
+        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
+        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
+        # For continuous-time DPMs (VPSDE), linear schedule:
+        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
+        """
+
+        if schedule not in ['discrete', 'linear', 'cosine']:
+            raise ValueError(
+                f"Unsupported noise schedule {schedule}. The schedule needs to be 'discrete' or 'linear' or 'cosine'"
+            )
+
+        self.schedule = schedule
+        if schedule == 'discrete':
+            if betas is not None:
+                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
+            else:
+                assert alphas_cumprod is not None
+                log_alphas = 0.5 * torch.log(alphas_cumprod)
+            self.total_N = len(log_alphas)
+            self.T = 1.
+            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
+            self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype)
+        else:
+            self.total_N = 1000
+            self.beta_0 = continuous_beta_0
+            self.beta_1 = continuous_beta_1
+            self.cosine_s = 0.008
+            self.cosine_beta_max = 999.
+            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
+            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
+            self.schedule = schedule
+            self.T = 0.9946 if schedule == 'cosine' else 1.
+
+    def marginal_log_mean_coeff(self, t):
+        """
+        Compute log(alpha_t) of a given continuous-time label t in [0, T].
+        """
+        if self.schedule == 'discrete':
+            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
+                                  self.log_alpha_array.to(t.device)).reshape((-1))
+        elif self.schedule == 'linear':
+            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
+        elif self.schedule == 'cosine':
+            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
+            return log_alpha_fn(t) - self.cosine_log_alpha_0
+
+    def marginal_alpha(self, t):
+        """
+        Compute alpha_t of a given continuous-time label t in [0, T].
+        """
+        return torch.exp(self.marginal_log_mean_coeff(t))
+
+    def marginal_std(self, t):
+        """
+        Compute sigma_t of a given continuous-time label t in [0, T].
+        """
+        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
+
+    def marginal_lambda(self, t):
+        """
+        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
+        """
+        log_mean_coeff = self.marginal_log_mean_coeff(t)
+        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
+        return log_mean_coeff - log_std
+
+    def inverse_lambda(self, lamb):
+        """
+        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
+        """
+        if self.schedule == 'linear':
+            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            Delta = self.beta_0 ** 2 + tmp
+            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
+        elif self.schedule == 'discrete':
+            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
+            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
+                               torch.flip(self.t_array.to(lamb.device), [1]))
+            return t.reshape((-1,))
+        else:
+            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
+                        1. + self.cosine_s) / math.pi - self.cosine_s
+            return t_fn(log_alpha)
+
+    def edm_sigma(self, t):
+        return self.marginal_std(t) / self.marginal_alpha(t)
+
+    def edm_inverse_sigma(self, edmsigma):
+        alpha = 1 / (edmsigma ** 2 + 1).sqrt()
+        sigma = alpha * edmsigma
+        lambda_t = torch.log(alpha / sigma)
+        return self.inverse_lambda(lambda_t)
+
+
+def model_wrapper(
+        model,
+        noise_schedule,
+        model_type="noise",
+        model_kwargs={},
+        guidance_type="uncond",
+        condition=None,
+        unconditional_condition=None,
+        guidance_scale=1.,
+        classifier_fn=None,
+        classifier_kwargs={},
+):
+    """Thanks to DPM-Solver for their code base"""
+    """Create a wrapper function for the noise prediction model.
+    SA-Solver needs to solve the continuous-time diffusion SDEs. For DPMs trained on discrete-time labels, we need to
+    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
+    We support four types of the diffusion model by setting `model_type`:
+        1. "noise": noise prediction model. (Trained by predicting noise).
+        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
+        3. "v": velocity prediction model. (Trained by predicting the velocity).
+            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
+            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
+                arXiv preprint arXiv:2202.00512 (2022).
+            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
+                arXiv preprint arXiv:2210.02303 (2022).
+
+        4. "score": marginal score function. (Trained by denoising score matching).
+            Note that the score function and the noise prediction model follows a simple relationship:
+            ```
+                noise(x_t, t) = -sigma_t * score(x_t, t)
+            ```
+    We support three types of guided sampling by DPMs by setting `guidance_type`:
+        1. "uncond": unconditional sampling by DPMs.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+            ``
+        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+            ``
+            The input `classifier_fn` has the following format:
+            ``
+                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
+            ``
+            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
+                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
+        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
+            ``
+            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
+            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
+                arXiv preprint arXiv:2207.12598 (2022).
+
+    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
+    or continuous-time labels (i.e. epsilon to T).
+    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
+    ``
+        def model_fn(x, t_continuous) -> noise:
+            t_input = get_model_input_time(t_continuous)
+            return noise_pred(model, x, t_input, **model_kwargs)
+    ``
+    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for SA-Solver.
+    ===============================================================
+    Args:
+        model: A diffusion model with the corresponding format described above.
+        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
+        model_type: A `str`. The parameterization type of the diffusion model.
+                    "noise" or "x_start" or "v" or "score".
+        model_kwargs: A `dict`. A dict for the other inputs of the model function.
+        guidance_type: A `str`. The type of the guidance for sampling.
+                    "uncond" or "classifier" or "classifier-free".
+        condition: A pytorch tensor. The condition for the guided sampling.
+                    Only used for "classifier" or "classifier-free" guidance type.
+        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
+                    Only used for "classifier-free" guidance type.
+        guidance_scale: A `float`. The scale for the guided sampling.
+        classifier_fn: A classifier function. Only used for the classifier guidance.
+        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
+    Returns:
+        A noise prediction model that accepts the noised data and the continuous time as the inputs.
+    """
+
+    def get_model_input_time(t_continuous):
+        """
+        Map the solver's continuous time `t_continuous` to the time label fed to the model.
+        A 'discrete' schedule shifts the time by `1 / total_N` and scales it by 1000;
+        any other schedule passes `t_continuous` through unchanged.
+        """
+        if noise_schedule.schedule != 'discrete':
+            return t_continuous
+        shifted = t_continuous - 1. / noise_schedule.total_N
+        return shifted * 1000.
+
+    def noise_pred_fn(x, t_continuous, cond=None):
+        """
+        Call the wrapped model and convert its raw output into a noise prediction.
+        `cond` is forwarded as a third positional argument only when it is not None.
+        Returns None when `model_type` is none of "noise", "x_start", "v", "score".
+        """
+        positional = (x, get_model_input_time(t_continuous))
+        if cond is not None:
+            positional = positional + (cond,)
+        output = model(*positional, **model_kwargs)
+
+        if model_type == "noise":
+            return output
+        if model_type == "x_start":
+            alpha_t = noise_schedule.marginal_alpha(t_continuous)
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            # Only entry 0 of each coefficient is applied to the whole batch.
+            return (x - alpha_t[0] * output) / sigma_t[0]
+        if model_type == "v":
+            alpha_t = noise_schedule.marginal_alpha(t_continuous)
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            return alpha_t[0] * output + sigma_t[0] * x
+        if model_type == "score":
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            return -sigma_t[0] * output
+        return None
+
+    def cond_grad_fn(x, t_input):
+        """
+        Return the gradient of the summed classifier output with respect to `x`,
+        i.e. nabla_{x} log p_t(cond | x_t).
+        """
+        with torch.enable_grad():
+            # Differentiate w.r.t. a detached copy so the caller's graph is untouched.
+            x_req = x.detach().requires_grad_(True)
+            summed_log_prob = classifier_fn(x_req, t_input, condition, **classifier_kwargs).sum()
+            gradients = torch.autograd.grad(summed_log_prob, x_req)
+            return gradients[0]
+
+    def model_fn(x, t_continuous):
+        """
+        The noise prediction function handed to SA-Solver.
+        Dispatches on `guidance_type`: plain prediction ("uncond"), classifier guidance
+        ("classifier") or classifier-free guidance ("classifier-free").
+        Returns None for any other `guidance_type`.
+        """
+        if guidance_type == "uncond":
+            return noise_pred_fn(x, t_continuous)
+
+        if guidance_type == "classifier":
+            assert classifier_fn is not None
+            # The classifier gradient is evaluated before the diffusion model itself.
+            cond_grad = cond_grad_fn(x, get_model_input_time(t_continuous))
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            noise = noise_pred_fn(x, t_continuous)
+            # NOTE(review): `sigma_t` is multiplied with `cond_grad` without reshaping; if the
+            # schedule returns one value per batch element this may not broadcast as intended
+            # for batch sizes > 1 -- confirm against the noise schedule implementation.
+            return noise - guidance_scale * sigma_t * cond_grad
+
+        if guidance_type == "classifier-free":
+            if guidance_scale == 1. or unconditional_condition is None:
+                return noise_pred_fn(x, t_continuous, cond=condition)
+            # One batched forward pass: unconditional half first, conditional half second.
+            doubled_x = torch.cat([x, x])
+            doubled_t = torch.cat([t_continuous, t_continuous])
+            doubled_cond = torch.cat([unconditional_condition, condition])
+            noise_uncond, noise = noise_pred_fn(doubled_x, doubled_t, cond=doubled_cond).chunk(2)
+            return noise_uncond + guidance_scale * (noise - noise_uncond)
+
+        return None
+
+    assert model_type in ["noise", "x_start", "v", "score"]
+    assert guidance_type in ["uncond", "classifier", "classifier-free"]
+    return model_fn
+
+
+class SASolver:
+    def __init__(
+            self,
+            model_fn,
+            noise_schedule,
+            algorithm_type="data_prediction",
+            correcting_x0_fn=None,
+            correcting_xt_fn=None,
+            thresholding_max_val=1.,
+            dynamic_thresholding_ratio=0.995
+    ):
+        """
+        Build an SA-Solver around a noise prediction function.
+        `algorithm_type` defaults to "data_prediction"; switching to "noise_prediction" is not
+        recommended, see Appendix A.2.4 of the SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
+        Passing the string "dynamic_thresholding" as `correcting_x0_fn` selects the built-in
+        `dynamic_thresholding_fn`; any other value is stored as given.
+        """
+
+        def batched_model(x, t):
+            # Expand the time label to one entry per batch element before calling the model.
+            return model_fn(x, t.expand((x.shape[0])))
+
+        self.model = batched_model
+        self.noise_schedule = noise_schedule
+        assert algorithm_type in ["data_prediction", "noise_prediction"]
+
+        use_builtin_thresholding = correcting_x0_fn == "dynamic_thresholding"
+        self.correcting_x0_fn = self.dynamic_thresholding_fn if use_builtin_thresholding else correcting_x0_fn
+
+        self.correcting_xt_fn = correcting_xt_fn
+        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+        self.thresholding_max_val = thresholding_max_val
+
+        self.predict_x0 = algorithm_type == "data_prediction"
+
+        # EDM sigma at t = 1e-3 and t = 1; read by the 'karras' branch of `get_time_steps`.
+        sigma_at_small_t = self.noise_schedule.edm_sigma(torch.tensor([1e-3]))
+        sigma_at_one = self.noise_schedule.edm_sigma(torch.tensor([1]))
+        self.sigma_min = float(sigma_at_small_t)
+        self.sigma_max = float(sigma_at_one)
+
+    def dynamic_thresholding_fn(self, x0, t=None):
+        """
+        Dynamic thresholding: clamp each sample of `x0` to [-s, s] and divide by s, where s is
+        the `dynamic_thresholding_ratio` quantile of |x0| over that sample, but never smaller
+        than `thresholding_max_val`. `t` is accepted but not used.
+        """
+        flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
+        s = torch.quantile(flat_abs, self.dynamic_thresholding_ratio, dim=1)
+        floor = self.thresholding_max_val * torch.ones_like(s).to(s.device)
+        s = expand_dims(torch.maximum(s, floor), x0.dim())
+        return torch.clamp(x0, -s, s) / s
+
+    def noise_prediction_fn(self, x, t):
+        """
+        Evaluate the wrapped noise prediction model at (`x`, `t`).
+        """
+        predicted_noise = self.model(x, t)
+        return predicted_noise
+
+    def data_prediction_fn(self, x, t):
+        """
+        Predict the clean data x0 from `x` at time `t`: x0 = (x - sigma_t * noise) / alpha_t,
+        then apply `correcting_x0_fn` when one is configured.
+        """
+        schedule = self.noise_schedule
+        predicted_noise = self.noise_prediction_fn(x, t)
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        x0 = (x - sigma_t * predicted_noise) / alpha_t
+        if self.correcting_x0_fn is None:
+            return x0
+        return self.correcting_x0_fn(x0)
+
+    def model_fn(self, x, t):
+        """
+        Evaluate the model in the configured parameterization: the data prediction when
+        `predict_x0` is set, the noise prediction otherwise.
+        """
+        predictor = self.data_prediction_fn if self.predict_x0 else self.noise_prediction_fn
+        return predictor(x, t)
+
+    def get_time_steps(self, skip_type, t_T, t_0, N, order, device):
+        """Compute the N + 1 time steps used for sampling, running from `t_T` to `t_0`.
+
+        `skip_type` selects the spacing: 'logSNR' (polynomial in the log-SNR offset from `t_T`),
+        'time' (polynomial in time) or 'karras' (sigma spacing with exponent 7, `order` unused).
+        Raises ValueError for any other `skip_type`.
+        """
+        num_points = N + 1
+        root = 1. / order
+
+        if skip_type == 'time':
+            grid = torch.linspace(t_T ** root, t_0 ** root, num_points)
+            return grid.pow(order).to(device)
+
+        if skip_type == 'logSNR':
+            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
+            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
+            # The grid is built from Python floats and only moved to `device` at the end.
+            span_root = (lambda_0 - lambda_T).cpu().item() ** root
+            offsets = torch.linspace(0., span_root, num_points).pow(order).to(device)
+            return self.noise_schedule.inverse_lambda(lambda_T + offsets)
+
+        if skip_type == 'karras':
+            sigma_lo = max(0.002, self.sigma_min)
+            sigma_hi = min(80, self.sigma_max)
+            sigma_grid = torch.linspace(sigma_hi ** (1. / 7), sigma_lo ** (1. / 7), num_points)
+            return self.noise_schedule.edm_inverse_sigma(sigma_grid.pow(7).to(device))
+
+        raise ValueError(
+            f"Unsupported skip_type {skip_type}, need to be 'logSNR' or 'time' or 'karras'"
+        )
+
+    def denoise_to_zero_fn(self, x, s):
+        """
+        Final denoising step: return the data prediction at time `s`, which is equivalent to
+        solving the ODE from lambda_s to infinity by first-order discretization.
+        """
+        denoised = self.data_prediction_fn(x, s)
+        return denoised
+
+    def get_coefficients_exponential_negative(self, order, interval_start, interval_end):
+        """
+        Closed form of the integral of exp(-x) * x^order over [interval_start, interval_end].
+        These integrals weight the gradient terms after Lagrange interpolation in the
+        noise_prediction formula, see Eq.(15) and Eq.(18) in the SA-Solver paper
+        https://arxiv.org/pdf/2309.05019.pdf
+        Both interval bounds are passed to `torch.exp`, so they must be tensors.
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        a, b = interval_start, interval_end
+        # The antiderivative is -exp(-x) * P(x); exp(-b) is factored out of both end points.
+        decay = torch.exp(-b)
+        if order == 0:
+            return decay * (torch.exp(b - a) - 1)
+        if order == 1:
+            return decay * ((a + 1) * torch.exp(b - a) - (b + 1))
+        if order == 2:
+            return decay * ((a ** 2 + 2 * a + 2) * torch.exp(b - a) - (b ** 2 + 2 * b + 2))
+        if order == 3:
+            return decay * ((a ** 3 + 3 * a ** 2 + 6 * a + 6) * torch.exp(b - a)
+                            - (b ** 3 + 3 * b ** 2 + 6 * b + 6))
+
+    def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau):
+        """
+        Closed form of the integral of exp(x * (1 + tau^2)) * x^order over
+        [interval_start, interval_end].
+        These integrals weight the gradient terms after Lagrange interpolation in the
+        data_prediction formula, see Eq.(15) and Eq.(18) in the SA-Solver paper
+        https://arxiv.org/pdf/2309.05019.pdf
+        """
+        assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3"
+
+        # Change of variable u = (1 + tau^2) * x; each power of x costs one factor of `scale`.
+        scale = 1 + tau ** 2
+        end_cov = scale * interval_end
+        start_cov = scale * interval_start
+        peak = torch.exp(end_cov)
+
+        if order == 0:
+            return peak * (1 - torch.exp(-(end_cov - start_cov))) / scale
+        if order == 1:
+            return peak * ((end_cov - 1) - (start_cov - 1) * torch.exp(-(end_cov - start_cov))) / (scale ** 2)
+        if order == 2:
+            return peak * ((end_cov ** 2 - 2 * end_cov + 2)
+                           - (start_cov ** 2 - 2 * start_cov + 2) * torch.exp(-(end_cov - start_cov))) / (scale ** 3)
+        if order == 3:
+            return peak * ((end_cov ** 3 - 3 * end_cov ** 2 + 6 * end_cov - 6)
+                           - (start_cov ** 3 - 3 * start_cov ** 2 + 6 * start_cov - 6)
+                           * torch.exp(-(end_cov - start_cov))) / (scale ** 4)
+
+    def lagrange_polynomial_coefficient(self, order, lambda_list):
+        """
+        Coefficients of the Lagrange basis polynomials through the nodes in `lambda_list`.
+        Row i holds the coefficients of the i-th basis polynomial, highest power first,
+        so the result has `order + 1` rows of `order + 1` entries each.
+        """
+        assert order in [0, 1, 2, 3]
+        assert order == len(lambda_list) - 1
+        if order == 0:
+            return [[1]]
+
+        rows = []
+        for i, node in enumerate(lambda_list):
+            # Basis polynomial i is prod_k (x - other_k) / prod_k (node - other_k).
+            others = [lam for k, lam in enumerate(lambda_list) if k != i]
+            if order == 1:
+                (a,) = others
+                denominator = node - a
+                rows.append([1 / denominator, -a / denominator])
+            elif order == 2:
+                a, b = others
+                denominator = (node - a) * (node - b)
+                rows.append([1 / denominator,
+                             (-a - b) / denominator,
+                             a * b / denominator])
+            elif order == 3:
+                a, b, c = others
+                denominator = (node - a) * (node - b) * (node - c)
+                rows.append([1 / denominator,
+                             (-a - b - c) / denominator,
+                             (a * b + a * c + b * c) / denominator,
+                             (-a * b * c) / denominator])
+        return rows
+
+    def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau):
+        """
+        Compute the weight of each previous model output in the update.
+        Coefficient i is the sum over j of (Lagrange coefficient [i][j]) times the exponential
+        integral of power `order - 1 - j`, using the positive-exponent integral for
+        data prediction and the negative-exponent one for noise prediction.
+        """
+        assert order in [1, 2, 3, 4]
+        assert order == len(lambda_list), 'the length of lambda list must be equal to the order'
+
+        def power_integral(power):
+            # Pick the integral that matches the active parameterization.
+            if self.predict_x0:
+                return self.get_coefficients_exponential_positive(power, interval_start, interval_end, tau)
+            return self.get_coefficients_exponential_negative(power, interval_start, interval_end)
+
+        lagrange_rows = self.lagrange_polynomial_coefficient(order - 1, lambda_list)
+        coefficients = []
+        for i in range(order):
+            total = 0
+            for j in range(order):
+                total = total + lagrange_rows[i][j] * power_integral(order - 1 - j)
+            coefficients.append(total)
+        assert len(coefficients) == order, 'the length of coefficients does not match the order'
+        return coefficients
+
+    def adams_bashforth_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
+        """
+        SA-Predictor, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
+
+        Advances `x` from time `t_prev_list[-1]` to time `t` using the last `order` entries of
+        `model_prev_list` (most recent last) and their times in `t_prev_list`.
+        `tau` sets the strength of the injected `noise`. Returns the new sample.
+        """
+        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
+
+        # Schedule quantities at the target time and at the most recent previous time.
+        schedule = self.noise_schedule
+        t_last = t_prev_list[-1]
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        lambda_t = schedule.marginal_lambda(t)
+        alpha_prev = schedule.marginal_alpha(t_last)
+        sigma_prev = schedule.marginal_std(t_last)
+        lambda_prev = schedule.marginal_lambda(t_last)
+        h = lambda_t - lambda_prev
+
+        lambda_list = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_prev, lambda_t, lambda_list, tau)
+
+        if self.predict_x0:
+            scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+            carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
+        else:
+            scale = -(1 + tau ** 2) * alpha_t
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+            carried = (alpha_t / alpha_prev) * x
+
+        # Coefficient i weights the i-th most recent model output.
+        gradient_part = torch.zeros_like(x)
+        for i, coefficient in enumerate(gradient_coefficients):
+            gradient_part += scale * coefficient * model_prev_list[-(i + 1)]
+
+        return carried + gradient_part + noise_part
+
+    def adams_moulton_update(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
+        """
+        SA-Corrector, without the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
+
+        Advances `x` from time `t_prev_list[-1]` to time `t`. Unlike the predictor, the
+        interpolation nodes include the target time `t` itself, so the last entry of
+        `model_prev_list` must be the model output at `t`; the `order` most recent entries
+        are used. `tau` sets the strength of the injected `noise`. Returns the new sample.
+        """
+
+        # The message used to name the Adams-Bashforth method (copy-paste from the predictor).
+        assert order in [1, 2, 3, 4], "order of stochastic adams moulton method is only supported for 1, 2, 3 and 4"
+
+        # Schedule quantities at the target time and at the most recent previous time.
+        schedule = self.noise_schedule
+        t_last = t_prev_list[-1]
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        lambda_t = schedule.marginal_lambda(t)
+        alpha_prev = schedule.marginal_alpha(t_last)
+        sigma_prev = schedule.marginal_std(t_last)
+        lambda_prev = schedule.marginal_lambda(t_last)
+        h = lambda_t - lambda_prev
+
+        # Nodes: target time first, then the previous times, most recent first.
+        t_list = t_prev_list + [t]
+        lambda_list = [schedule.marginal_lambda(t_list[-(k + 1)]) for k in range(order)]
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_prev, lambda_t, lambda_list, tau)
+
+        if self.predict_x0:
+            scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+            carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
+        else:
+            scale = -(1 + tau ** 2) * alpha_t
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+            carried = (alpha_t / alpha_prev) * x
+
+        # Coefficient i weights the i-th most recent model output.
+        gradient_part = torch.zeros_like(x)
+        for i, coefficient in enumerate(gradient_coefficients):
+            gradient_part += scale * coefficient * model_prev_list[-(i + 1)]
+
+        return carried + gradient_part + noise_part
+
+    def adams_bashforth_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
+        """
+        SA-Predictor, with the "rescaling" trick in Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
+
+        Same update as `adams_bashforth_update`, except that for data prediction with
+        `order == 2` the two gradient coefficients are shifted by equal and opposite amounts.
+        Returns the new sample at time `t`.
+        """
+
+        assert order in [1, 2, 3, 4], "order of stochastic adams bashforth method is only supported for 1, 2, 3 and 4"
+
+        # Schedule quantities at the target time and at the most recent previous time.
+        schedule = self.noise_schedule
+        t_last = t_prev_list[-1]
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        lambda_t = schedule.marginal_lambda(t)
+        alpha_prev = schedule.marginal_alpha(t_last)
+        sigma_prev = schedule.marginal_std(t_last)
+        lambda_prev = schedule.marginal_lambda(t_last)
+        h = lambda_t - lambda_prev
+
+        lambda_list = [schedule.marginal_lambda(t_prev_list[-(k + 1)]) for k in range(order)]
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_prev, lambda_t, lambda_list, tau)
+
+        if self.predict_x0 and order == 2:
+            # Order-2 modification, similar to UniPC, that does not influence the convergence
+            # order. Note: this is used only for few steps sampling.
+            # The added term is O(h^3). Empirically it slightly improves the image quality.
+            # With tau = 0 (ODE case) the bracket reduces to h ** 2 / 2 - (h - 1 + exp(-h)).
+            lambda_gap = schedule.marginal_lambda(t_prev_list[-1]) - schedule.marginal_lambda(t_prev_list[-2])
+            correction = 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                h ** 2 / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                    (1 + tau ** 2) ** 2)) / lambda_gap
+            gradient_coefficients[0] += correction
+            gradient_coefficients[1] -= correction
+
+        if self.predict_x0:
+            scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+            carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
+        else:
+            scale = -(1 + tau ** 2) * alpha_t
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+            carried = (alpha_t / alpha_prev) * x
+
+        # Coefficient i weights the i-th most recent model output.
+        gradient_part = torch.zeros_like(x)
+        for i, coefficient in enumerate(gradient_coefficients):
+            gradient_part += scale * coefficient * model_prev_list[-(i + 1)]
+
+        return carried + gradient_part + noise_part
+
+    def adams_moulton_update_few_steps(self, order, x, tau, model_prev_list, t_prev_list, noise, t):
+        """
+        SA-Corrector for few-step sampling.
+        See Appendix D in SA-Solver paper https://arxiv.org/pdf/2309.05019.pdf
+
+        Same update as `adams_moulton_update`, except that for data prediction with
+        `order == 2` the two gradient coefficients are shifted by equal and opposite amounts
+        (the counterpart of the modification in `adams_bashforth_update_few_steps`).
+        The last entry of `model_prev_list` must be the model output at the target time `t`.
+        Returns the new sample at time `t`.
+        """
+
+        # The message used to name the Adams-Bashforth method (copy-paste from the predictor).
+        assert order in [1, 2, 3, 4], "order of stochastic adams moulton method is only supported for 1, 2, 3 and 4"
+
+        # Schedule quantities at the target time and at the most recent previous time.
+        schedule = self.noise_schedule
+        t_last = t_prev_list[-1]
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        lambda_t = schedule.marginal_lambda(t)
+        alpha_prev = schedule.marginal_alpha(t_last)
+        sigma_prev = schedule.marginal_std(t_last)
+        lambda_prev = schedule.marginal_lambda(t_last)
+        h = lambda_t - lambda_prev
+
+        # Nodes: target time first, then the previous times, most recent first.
+        t_list = t_prev_list + [t]
+        lambda_list = [schedule.marginal_lambda(t_list[-(k + 1)]) for k in range(order)]
+        gradient_coefficients = self.get_coefficients_fn(order, lambda_prev, lambda_t, lambda_list, tau)
+
+        if self.predict_x0 and order == 2:
+            # Order-2 modification, similar to UniPC, that does not influence the convergence
+            # order. Note: this is used only for few steps sampling.
+            # The added term is O(h^3). Empirically it slightly improves the image quality.
+            # With tau = 0 (ODE case) the bracket reduces to h / 2 - (h - 1 + exp(-h)) / h.
+            correction = 1.0 * torch.exp((1 + tau ** 2) * lambda_t) * (
+                h / 2 - (h * (1 + tau ** 2) - 1 + torch.exp((1 + tau ** 2) * (-h))) / (
+                    (1 + tau ** 2) ** 2 * h))
+            gradient_coefficients[0] += correction
+            gradient_coefficients[1] -= correction
+
+        if self.predict_x0:
+            scale = (1 + tau ** 2) * sigma_t * torch.exp(- tau ** 2 * lambda_t)
+            noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau ** 2 * h)) * noise
+            carried = torch.exp(-tau ** 2 * h) * (sigma_t / sigma_prev) * x
+        else:
+            scale = -(1 + tau ** 2) * alpha_t
+            noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise
+            carried = (alpha_t / alpha_prev) * x
+
+        # Coefficient i weights the i-th most recent model output.
+        gradient_part = torch.zeros_like(x)
+        for i, coefficient in enumerate(gradient_coefficients):
+            gradient_part += scale * coefficient * model_prev_list[-(i + 1)]
+
+        return carried + gradient_part + noise_part
+
+    def sample_few_steps(self, x, tau, steps=5, t_start=None, t_end=None, skip_type='time', skip_order=1,
+                         predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
+                         ):
+        """
+        Run the SA-Solver predictor-corrector loop using the `*_few_steps` update rules.
+
+        For the PC-mode, please refer to the wiki page
+        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
+        'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations.
+        We recommend pc_mode='PEC' when the number of function evaluations (NFEs) is limited;
+        'PECE' mode is only for tests with sufficient NFEs.
+
+        Args:
+            x: The starting tensor; the model is first evaluated on it at the first time step.
+            tau: A callable; `tau(t)` is the value passed as `tau` to the update rules at time `t`.
+            steps: Number of solver steps; must be >= max(predictor_order, corrector_order - 1).
+            t_start: Start time; defaults to `noise_schedule.T`.
+            t_end: End time; defaults to `1 / noise_schedule.total_N`.
+            skip_type: Spacing rule forwarded to `get_time_steps`.
+            skip_order: The `order` argument forwarded to `get_time_steps`.
+            predictor_order: Maximum order used by the predictor.
+            corrector_order: Maximum order used by the corrector; 0 skips the corrector.
+            pc_mode: 'PEC' or 'PECE'.
+            return_intermediate: If True, also return the list of intermediate `x` values.
+        Returns:
+            `x`, or `(x, intermediates)` when `return_intermediate` is True.
+        """
+
+        # Hard-coded behaviour switches for the few-steps sampler.
+        # With these values: the model is evaluated at the first time step, the last step runs
+        # the predictor only (no model evaluation, no corrector), the orders are reduced towards
+        # the end, and the final `denoise_to_zero` block below is never entered.
+        skip_first_step = False
+        skip_final_step = True
+        lower_order_final = True
+        denoise_to_zero = False
+
+        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
+        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
+        t_T = self.noise_schedule.T if t_start is None else t_start
+        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+
+        device = x.device
+        intermediates = []
+        with torch.no_grad():
+            # The warm-up loop below needs this many steps before the full orders are available.
+            assert steps >= max(predictor_order, corrector_order - 1)
+            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
+                                            device=device)
+            assert timesteps.shape[0] - 1 == steps
+            # Init the initial values.
+            step = 0
+            t = timesteps[step]
+            noise = torch.randn_like(x)
+            # History of visited times; `model_prev_list` holds the matching model outputs.
+            t_prev_list = [t]
+            # do not evaluate if skip_first_step
+            if skip_first_step:
+                if self.predict_x0:
+                    alpha_t = self.noise_schedule.marginal_alpha(t)
+                    sigma_t = self.noise_schedule.marginal_std(t)
+                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
+                else:
+                    model_prev_list = [x]
+            else:
+                model_prev_list = [self.model_fn(x, t)]
+
+            if self.correcting_xt_fn is not None:
+                x = self.correcting_xt_fn(x, t, step)
+            if return_intermediate:
+                intermediates.append(x)
+
+            # determine the first several values
+            # Warm-up: the usable order is capped by how many model outputs exist so far.
+            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):
+
+                t = timesteps[step]
+                predictor_order_used = min(predictor_order, step)
+                corrector_order_used = min(corrector_order, step + 1)
+                # The same noise draw is passed to both the predictor and the corrector.
+                noise = torch.randn_like(x)
+                # predictor step
+                x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
+                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
+                                                            noise=noise, t=t)
+                # evaluation step
+                model_x = self.model_fn(x_p, t)
+
+                # update model_list
+                model_prev_list.append(model_x)
+                # corrector step
+                if corrector_order > 0:
+                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
+                                                            model_prev_list=model_prev_list, t_prev_list=t_prev_list,
+                                                            noise=noise, t=t)
+                else:
+                    x = x_p
+
+                # evaluation step if correction and mode = pece
+                # PECE replaces the output computed on the prediction with one computed on the corrected x.
+                if corrector_order > 0 and pc_mode == 'PECE':
+                    model_x = self.model_fn(x, t)
+                    del model_prev_list[-1]
+                    model_prev_list.append(model_x)
+
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step)
+                if return_intermediate:
+                    intermediates.append(x)
+
+                t_prev_list.append(t)
+
+            # Main loop: runs through the last time step (index `steps`).
+            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
+                if lower_order_final:
+                    # Cap the orders by the number of remaining steps.
+                    predictor_order_used = min(predictor_order, steps - step + 1)
+                    corrector_order_used = min(corrector_order, steps - step + 2)
+
+                else:
+                    predictor_order_used = predictor_order
+                    corrector_order_used = corrector_order
+                t = timesteps[step]
+                noise = torch.randn_like(x)
+
+                # predictor step
+                # On the skipped final step the predictor is called with tau=0 instead of tau(t).
+                if skip_final_step and step == steps and not denoise_to_zero:
+                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=0,
+                                                                model_prev_list=model_prev_list,
+                                                                t_prev_list=t_prev_list, noise=noise, t=t)
+                else:
+                    x_p = self.adams_bashforth_update_few_steps(order=predictor_order_used, x=x, tau=tau(t),
+                                                                model_prev_list=model_prev_list,
+                                                                t_prev_list=t_prev_list, noise=noise, t=t)
+
+                # evaluation step
+                # do not evaluate if skip_final_step and step = steps
+                if not skip_final_step or step < steps:
+                    model_x = self.model_fn(x_p, t)
+
+                # update model_list
+                # do not update if skip_final_step and step = steps
+                if not skip_final_step or step < steps:
+                    model_prev_list.append(model_x)
+
+                # corrector step
+                # do not correct if skip_final_step and step = steps
+                if corrector_order > 0 and (not skip_final_step or step < steps):
+                    x = self.adams_moulton_update_few_steps(order=corrector_order_used, x=x, tau=tau(t),
+                                                            model_prev_list=model_prev_list,
+                                                            t_prev_list=t_prev_list, noise=noise, t=t)
+                else:
+                    x = x_p
+
+                # evaluation step if mode = pece and step != steps
+                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
+                    model_x = self.model_fn(x, t)
+                    del model_prev_list[-1]
+                    model_prev_list.append(model_x)
+
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step)
+                if return_intermediate:
+                    intermediates.append(x)
+
+                t_prev_list.append(t)
+                # Drop the oldest model output so the history does not keep growing.
+                del model_prev_list[0]
+
+            if denoise_to_zero:
+                t = torch.ones((1,)).to(device) * t_0
+                x = self.denoise_to_zero_fn(x, t)
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step + 1)
+                if return_intermediate:
+                    intermediates.append(x)
+        return (x, intermediates) if return_intermediate else x
+
+    def sample_more_steps(self, x, tau, steps=20, t_start=None, t_end=None, skip_type='time', skip_order=1,
+                          predictor_order=3, corrector_order=4, pc_mode='PEC', return_intermediate=False
+                          ):
+        """
+        Predictor-corrector sampling loop, 'more_steps' variant.
+
+        The behaviour switches are fixed inside this method (skip_first_step=False,
+        skip_final_step=False, lower_order_final=True, denoise_to_zero=True): the model is evaluated
+        at every time step including the first and the last one, the orders are reduced towards the
+        end of the trajectory, and the result is finally passed through self.denoise_to_zero_fn.
+
+        For the PC-mode, please refer to the wiki page
+        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
+        'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations.
+        We recommend pc_mode='PEC' when NFEs are limited. 'PECE' mode is only for tests with sufficient NFEs.
+
+        Args:
+            x: starting tensor of the sampling trajectory; its device is used for the time grid.
+            tau: callable; tau(t) is evaluated at each time step and handed to the update methods.
+            steps: number of solver steps; must be >= max(predictor_order, corrector_order - 1).
+            t_start: start time; defaults to self.noise_schedule.T.
+            t_end: end time; defaults to 1. / self.noise_schedule.total_N.
+            skip_type: forwarded to self.get_time_steps.
+            skip_order: forwarded to self.get_time_steps as `order`.
+            predictor_order: maximum order passed to self.adams_bashforth_update.
+            corrector_order: maximum order passed to self.adams_moulton_update; a value <= 0
+                disables the corrector (the predicted state is used directly).
+            pc_mode: 'PEC' or 'PECE'.
+            return_intermediate: if True, also return the list of intermediate states.
+
+        Returns:
+            x if return_intermediate is False, otherwise the tuple (x, intermediates).
+        """
+
+        # Fixed configuration of this variant; with these values the `skip_*` branches below
+        # are never taken, they are kept so the code mirrors the 'few_steps' implementation.
+        skip_first_step = False
+        skip_final_step = False
+        lower_order_final = True
+        denoise_to_zero = True
+
+        assert pc_mode in ['PEC', 'PECE'], 'Predictor-corrector mode only supports PEC and PECE'
+        t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
+        t_T = self.noise_schedule.T if t_start is None else t_start
+        assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
+
+        device = x.device
+        intermediates = []
+        with torch.no_grad():
+            # The warm-up loop below needs at least this many steps to reach the requested orders.
+            assert steps >= max(predictor_order, corrector_order - 1)
+            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, order=skip_order,
+                                            device=device)
+            assert timesteps.shape[0] - 1 == steps
+            # Init the initial values.
+            step = 0
+            t = timesteps[step]
+            # NOTE(review): this draw is never read (both loops redraw `noise` before using it),
+            # but removing it would presumably shift the random stream of the later draws.
+            noise = torch.randn_like(x)
+            t_prev_list = [t]
+            # do not evaluate if skip_first_step
+            if skip_first_step:
+                if self.predict_x0:
+                    alpha_t = self.noise_schedule.marginal_alpha(t)
+                    sigma_t = self.noise_schedule.marginal_std(t)
+                    model_prev_list = [(1 - sigma_t) / alpha_t * x]
+                else:
+                    model_prev_list = [x]
+            else:
+                model_prev_list = [self.model_fn(x, t)]
+
+            if self.correcting_xt_fn is not None:
+                x = self.correcting_xt_fn(x, t, step)
+            if return_intermediate:
+                intermediates.append(x)
+
+            # determine the first several values
+            # (warm-up: the usable order grows with the number of stored model outputs)
+            for step in tqdm(range(1, max(predictor_order, corrector_order - 1))):
+
+                t = timesteps[step]
+                predictor_order_used = min(predictor_order, step)
+                corrector_order_used = min(corrector_order, step + 1)
+                # the same noise sample is shared by the predictor and the corrector of this step
+                noise = torch.randn_like(x)
+                # predictor step
+                x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
+                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
+                                                  t=t)
+                # evaluation step
+                model_x = self.model_fn(x_p, t)
+
+                # update model_list
+                model_prev_list.append(model_x)
+                # corrector step
+                if corrector_order > 0:
+                    x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
+                                                  model_prev_list=model_prev_list, t_prev_list=t_prev_list, noise=noise,
+                                                  t=t)
+                else:
+                    x = x_p
+
+                # evaluation step if mode = pece
+                # (the stored evaluation at x_p is replaced by one at the corrected x)
+                if corrector_order > 0 and pc_mode == 'PECE':
+                    model_x = self.model_fn(x, t)
+                    del model_prev_list[-1]
+                    model_prev_list.append(model_x)
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step)
+                if return_intermediate:
+                    intermediates.append(x)
+
+                t_prev_list.append(t)
+
+            # main loop: full-order steps, with the order lowered again near the end
+            for step in tqdm(range(max(predictor_order, corrector_order - 1), steps + 1)):
+                if lower_order_final:
+                    predictor_order_used = min(predictor_order, steps - step + 1)
+                    corrector_order_used = min(corrector_order, steps - step + 2)
+
+                else:
+                    predictor_order_used = predictor_order
+                    corrector_order_used = corrector_order
+                t = timesteps[step]
+                noise = torch.randn_like(x)
+
+                # predictor step
+                if skip_final_step and step == steps and not denoise_to_zero:
+                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=0,
+                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
+                                                      noise=noise, t=t)
+                else:
+                    x_p = self.adams_bashforth_update(order=predictor_order_used, x=x, tau=tau(t),
+                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
+                                                      noise=noise, t=t)
+
+                # evaluation step
+                # do not evaluate if skip_final_step and step = steps
+                if not skip_final_step or step < steps:
+                    model_x = self.model_fn(x_p, t)
+
+                # update model_list
+                # do not update if skip_final_step and step = steps
+                if not skip_final_step or step < steps:
+                    model_prev_list.append(model_x)
+
+                # corrector step
+                # do not correct if skip_final_step and step = steps
+                if corrector_order > 0:
+                    if not skip_final_step or step < steps:
+                        x = self.adams_moulton_update(order=corrector_order_used, x=x, tau=tau(t),
+                                                      model_prev_list=model_prev_list, t_prev_list=t_prev_list,
+                                                      noise=noise, t=t)
+                    else:
+                        x = x_p
+                else:
+                    x = x_p
+
+                # evaluation step if mode = pece and step != steps
+                if corrector_order > 0 and (pc_mode == 'PECE' and step < steps):
+                    model_x = self.model_fn(x, t)
+                    del model_prev_list[-1]
+                    model_prev_list.append(model_x)
+
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step)
+                if return_intermediate:
+                    intermediates.append(x)
+
+                t_prev_list.append(t)
+                # drop the oldest model output so the history length stays constant in this loop
+                del model_prev_list[0]
+
+            if denoise_to_zero:
+                # one-element time tensor holding t_0, placed on the same device as x
+                t = torch.ones((1,)).to(device) * t_0
+                x = self.denoise_to_zero_fn(x, t)
+                if self.correcting_xt_fn is not None:
+                    x = self.correcting_xt_fn(x, t, step + 1)
+                if return_intermediate:
+                    intermediates.append(x)
+        if return_intermediate:
+            return x, intermediates
+        else:
+            return x
+
+    def sample(self, mode, x, tau, steps, t_start=None, t_end=None, skip_type='time', skip_order=1, predictor_order=3,
+               corrector_order=4, pc_mode='PEC', return_intermediate=False
+               ):
+        """
+        Entry point that dispatches to sample_few_steps or sample_more_steps.
+
+        For the PC-mode, please refer to the wiki page
+        https://en.wikipedia.org/wiki/Predictor%E2%80%93corrector_method#PEC_mode_and_PECE_mode
+        'PEC' needs one model evaluation per step while 'PECE' needs two model evaluations.
+        We recommend pc_mode='PEC' when NFEs are limited. 'PECE' mode is only for tests with sufficient NFEs.
+
+        'few_steps' mode is recommended. The differences between 'few_steps' and 'more_steps' are:
+        1) 'few_steps' does not correct at the final step and does not denoise to zero, while
+        'more_steps' does both. Thus the NFEs for 'few_steps' = steps, NFEs for 'more_steps' = steps + 2.
+        For most experiments and tasks these two operations do not help sample quality much.
+        2) 'few_steps' uses a rescaling trick as in Appendix D of the SA-Solver paper
+        https://arxiv.org/pdf/2309.05019.pdf, which slightly improves sample quality, especially
+        with few steps.
+
+        All arguments other than `mode` are forwarded unchanged, by keyword, to the selected method,
+        and its return value is returned as is.
+        """
+        assert mode in ['few_steps', 'more_steps'], "mode must be either 'few_steps' or 'more_steps'"
+        # Both implementations take the same keyword arguments, so pick one and call it once.
+        sampler = self.sample_few_steps if mode == 'few_steps' else self.sample_more_steps
+        return sampler(x=x, tau=tau, steps=steps, t_start=t_start, t_end=t_end, skip_type=skip_type,
+                       skip_order=skip_order, predictor_order=predictor_order,
+                       corrector_order=corrector_order, pc_mode=pc_mode,
+                       return_intermediate=return_intermediate)
+
+
+#############################################################
+# other utility functions
+#############################################################
+
+def interpolate_fn(x, xp, yp):
+    """
+    Evaluate the piecewise linear function through the keypoints (xp, yp) at x.
+
+    Only differentiable tensor operations are used, so the result supports autograd.
+    Queries outside the range of xp are extrapolated with the outermost segment.
+    Args:
+        x: PyTorch tensor with shape [N, C], where N is the batch size and C the number of channels
+            (we use C = 1 for DPM-Solver).
+        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
+        yp: PyTorch tensor with shape [C, K].
+    Returns:
+        The function values f(x), with shape [N, C].
+    """
+    batch, num_keys = x.shape[0], xp.shape[1]
+    dev = x.device
+
+    # Put every query in front of its channel's keypoints and sort; the query sat at index 0
+    # before sorting, so the argmin of the sort permutation is its rank among the keypoints.
+    keypoints = xp.unsqueeze(0).repeat((batch, 1, 1))
+    merged = torch.cat([x.unsqueeze(2), keypoints], dim=2)
+    merged_sorted, order = torch.sort(merged, dim=2)
+    rank = torch.argmin(order, dim=2)
+
+    below = rank - 1
+    left_of_all = torch.eq(rank, 0)
+    right_of_all = torch.eq(rank, num_keys)
+    # Index of the lower keypoint when the query is not left of all keypoints.
+    inner_lower = torch.where(right_of_all, torch.tensor(num_keys - 2, device=dev), below)
+
+    # Segment end points, addressed inside the merged (query + keypoints) sorted tensor.
+    lower = torch.where(left_of_all, torch.tensor(1, device=dev), inner_lower)
+    # When the query lies inside the segment it occupies the slot between both ends: skip it.
+    upper = torch.where(torch.eq(lower, below), lower + 2, lower + 1)
+    x_lower = torch.gather(merged_sorted, dim=2, index=lower.unsqueeze(2)).squeeze(2)
+    x_upper = torch.gather(merged_sorted, dim=2, index=upper.unsqueeze(2)).squeeze(2)
+
+    # The same segment addressed inside yp, which does not contain the query.
+    y_lower_idx = torch.where(left_of_all, torch.tensor(0, device=dev), inner_lower)
+    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
+    y_lower = torch.gather(yp_batched, dim=2, index=y_lower_idx.unsqueeze(2)).squeeze(2)
+    y_upper = torch.gather(yp_batched, dim=2, index=(y_lower_idx + 1).unsqueeze(2)).squeeze(2)
+
+    return y_lower + (x - x_lower) * (y_upper - y_lower) / (x_upper - x_lower)
+
+
+def expand_dims(v, dims):
+    """
+    Append trailing singleton axes to `v` until it has `dims` dimensions.
+    Args:
+        `v`: a PyTorch tensor with shape [N].
+        `dims`: an `int`, the total number of dimensions of the result.
+    Returns:
+        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
+    """
+    # Indexing with None inserts a new axis of size 1; one is needed per missing dimension.
+    trailing_axes = (None,) * (dims - 1)
+    return v[(Ellipsis,) + trailing_axes]
\ No newline at end of file
--- a/diffusion/model/t5.py
+++ b/diffusion/model/t5.py
+# -*- coding: utf-8 -*-
+import os
+import re
+import html
+import urllib.parse as ul
+
+import ftfy
+import torch
+from bs4 import BeautifulSoup
+from transformers import T5EncoderModel, AutoTokenizer
+from huggingface_hub import hf_hub_download
+
+class T5Embedder:
+    """
+    Caption encoder built on a T5 encoder model and its tokenizer.
+
+    The constructor resolves where the tokenizer and the weights come from (a local cache
+    directory, the DeepFloyd hub repository, or a user supplied path) and loads both.
+    get_text_embeddings() turns a list of captions into encoder hidden states plus the
+    attention mask; captions are optionally normalised with clean_caption() first.
+    """
+
+    available_models = ['t5-v1_1-xxl']
+    # One or more consecutive "junk" punctuation characters. Written as a single raw string:
+    # the pattern is identical to the former concatenation of non-raw pieces ('\)', '\(' ...),
+    # but no longer relies on invalid string escape sequences, which newer Python versions warn about.
+    bad_punct_regex = re.compile(r'[#®•©™&@·º½¾¿¡§~\)\(\]\[\}\{\|\\/\*]{1,}')  # noqa
+
+    def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None, use_text_preprocessing=True,
+                 t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None, model_max_length=120):
+        """
+        Load the tokenizer and the T5 encoder.
+
+        Args:
+            device: anything accepted by torch.device; inputs are moved there before encoding.
+            dir_or_name: name of a model in `available_models`, or a path / identifier handed to
+                T5EncoderModel.from_pretrained.
+            local_cache: if True, load tokenizer and weights from `<cache_dir>/<dir_or_name>`
+                without downloading anything.
+            cache_dir: root cache directory; defaults to `~/.cache/IF_`.
+            hf_token: token passed to hf_hub_download.
+            use_text_preprocessing: if True, captions are cleaned with clean_caption().
+            t5_model_kwargs: keyword arguments for T5EncoderModel.from_pretrained; when None a
+                default set (dtype, low_cpu_mem_usage, device_map) is built here.
+            torch_dtype: dtype used in the default kwargs; defaults to torch.bfloat16.
+            use_offload_folder: if given (and t5_model_kwargs is None), encoder blocks 12-23, the
+                final layer norm and the dropout are mapped to 'disk' with this offload folder.
+            model_max_length: max_length handed to the tokenizer.
+        """
+        self.device = torch.device(device)
+        self.torch_dtype = torch_dtype or torch.bfloat16
+        if t5_model_kwargs is None:
+            t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
+            if use_offload_folder is not None:
+                t5_model_kwargs['offload_folder'] = use_offload_folder
+                # First half of the encoder stays on the device, second half is offloaded to disk.
+                device_map = {'shared': self.device, 'encoder.embed_tokens': self.device}
+                device_map.update({f'encoder.block.{idx}': self.device for idx in range(12)})
+                device_map.update({f'encoder.block.{idx}': 'disk' for idx in range(12, 24)})
+                device_map['encoder.final_layer_norm'] = 'disk'
+                device_map['encoder.dropout'] = 'disk'
+                t5_model_kwargs['device_map'] = device_map
+            else:
+                t5_model_kwargs['device_map'] = {'shared': self.device, 'encoder': self.device}
+
+        self.use_text_preprocessing = use_text_preprocessing
+        self.hf_token = hf_token
+        self.cache_dir = cache_dir or os.path.expanduser('~/.cache/IF_')
+        self.dir_or_name = dir_or_name
+
+        tokenizer_files = ['config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer_config.json']
+        weight_files = [
+            'pytorch_model.bin.index.json', 'pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin'
+        ]
+
+        tokenizer_path, path = dir_or_name, dir_or_name
+        if local_cache:
+            cache_dir = os.path.join(self.cache_dir, dir_or_name)
+            tokenizer_path, path = cache_dir, cache_dir
+        elif dir_or_name in self.available_models:
+            # Known model: fetch tokenizer and weights into the cache and load both from there.
+            cache_dir = os.path.join(self.cache_dir, dir_or_name)
+            for filename in tokenizer_files + weight_files:
+                hf_hub_download(repo_id=f'DeepFloyd/{dir_or_name}', filename=filename, cache_dir=cache_dir,
+                                force_filename=filename, token=self.hf_token)
+            tokenizer_path, path = cache_dir, cache_dir
+        else:
+            # Custom weights: only the tokenizer files come from the reference repository,
+            # the weights are loaded from `dir_or_name` itself.
+            cache_dir = os.path.join(self.cache_dir, 't5-v1_1-xxl')
+            for filename in tokenizer_files:
+                hf_hub_download(repo_id='DeepFloyd/t5-v1_1-xxl', filename=filename, cache_dir=cache_dir,
+                                force_filename=filename, token=self.hf_token)
+            tokenizer_path = cache_dir
+
+        print(tokenizer_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
+        self.model_max_length = model_max_length
+
+    def get_text_embeddings(self, texts):
+        """
+        Encode a list of captions.
+
+        Args:
+            texts: iterable of captions; each one goes through text_preprocessing() first.
+
+        Returns:
+            (embeddings, attention_mask): the encoder's `last_hidden_state` (detached) and the
+            tokenizer's attention mask, both on self.device. Every caption is padded or
+            truncated to self.model_max_length tokens.
+        """
+        texts = [self.text_preprocessing(text) for text in texts]
+
+        text_tokens_and_mask = self.tokenizer(
+            texts,
+            max_length=self.model_max_length,
+            padding='max_length',
+            truncation=True,
+            return_attention_mask=True,
+            add_special_tokens=True,
+            return_tensors='pt'
+        )
+
+        # Move the inputs to the device once and reuse the mask for the return value.
+        input_ids = text_tokens_and_mask['input_ids'].to(self.device)
+        attention_mask = text_tokens_and_mask['attention_mask'].to(self.device)
+
+        with torch.no_grad():
+            text_encoder_embs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )['last_hidden_state'].detach()
+        return text_encoder_embs, attention_mask
+
+    def text_preprocessing(self, text):
+        """Return the cleaned caption, or just the lower-cased, stripped text if cleaning is disabled."""
+        if self.use_text_preprocessing:
+            # The exact text cleaning as was in the training stage:
+            # clean_caption is deliberately applied twice.
+            text = self.clean_caption(text)
+            text = self.clean_caption(text)
+            return text
+        else:
+            return text.lower().strip()
+
+    @staticmethod
+    def basic_clean(text):
+        """Fix the text with ftfy, unescape HTML entities twice and strip surrounding whitespace."""
+        text = ftfy.fix_text(text)
+        text = html.unescape(html.unescape(text))
+        return text.strip()
+
+    def clean_caption(self, caption):
+        """
+        Normalise a raw (web-scraped style) caption.
+
+        The steps are applied in a fixed order: URL-unquoting and lower-casing, removal of URLs,
+        HTML markup, @nicknames and CJK characters, unification of dashes and quotes, removal of
+        ids / file names / shop boilerplate, and whitespace and punctuation tidying.
+
+        Args:
+            caption: any object; it is converted with str().
+
+        Returns:
+            The cleaned caption, stripped of surrounding whitespace.
+        """
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub('<person>', 'person', caption)
+        # urls:
+        caption = re.sub(
+            r'\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
+            '', caption)  # regex for urls
+        caption = re.sub(
+            r'\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))',  # noqa
+            '', caption)  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features='html.parser').text
+
+        # @<nickname>
+        caption = re.sub(r'@[\w\d]+\b', '', caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r'[\u31c0-\u31ef]+', '', caption)
+        caption = re.sub(r'[\u31f0-\u31ff]+', '', caption)
+        caption = re.sub(r'[\u3200-\u32ff]+', '', caption)
+        caption = re.sub(r'[\u3300-\u33ff]+', '', caption)
+        caption = re.sub(r'[\u3400-\u4dbf]+', '', caption)
+        caption = re.sub(r'[\u4dc0-\u4dff]+', '', caption)
+        caption = re.sub(r'[\u4e00-\u9fff]+', '', caption)
+        #######################################################
+
+        # all types of dash --> "-"
+        caption = re.sub(
+            r'[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+',  # noqa
+            '-', caption)
+
+        # bring all quote characters to one standard
+        caption = re.sub(r'[`´«»“”¨]', '"', caption)
+        caption = re.sub(r'[‘’]', "'", caption)
+
+        # &quot;
+        caption = re.sub(r'&quot;?', '', caption)
+        # &amp
+        caption = re.sub(r'&amp', '', caption)
+
+        # ip addresses:
+        caption = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' ', caption)
+
+        # article ids:
+        caption = re.sub(r'\d:\d\d\s+$', '', caption)
+
+        # \n
+        caption = re.sub(r'\\n', ' ', caption)
+
+        # "#123"
+        caption = re.sub(r'#\d{1,3}\b', '', caption)
+        # "#12345.."
+        caption = re.sub(r'#\d{5,}\b', '', caption)
+        # "123456.."
+        caption = re.sub(r'\b\d{6,}\b', '', caption)
+        # filenames:
+        caption = re.sub(r'[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)', '', caption)
+
+        # repeated quotes and dots
+        caption = re.sub(r'[\"\']{2,}', r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r'[\.]{2,}', r' ', caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r' ', caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r'\s+\.\s+', r' ', caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r'(?:\-|\_)')
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, ' ', caption)
+
+        caption = self.basic_clean(caption)
+
+        caption = re.sub(r'\b[a-zA-Z]{1,3}\d{3,15}\b', '', caption)  # jc6640
+        caption = re.sub(r'\b[a-zA-Z]+\d+[a-zA-Z]+\b', '', caption)  # jc6640vc
+        caption = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', caption)  # 6640vc231
+
+        caption = re.sub(r'(worldwide\s+)?(free\s+)?shipping', '', caption)
+        caption = re.sub(r'(free\s)?download(\sfree)?', '', caption)
+        caption = re.sub(r'\bclick\b\s(?:for|on)\s\w+', '', caption)
+        caption = re.sub(r'\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?', '', caption)
+        caption = re.sub(r'\bpage\s+\d+\b', '', caption)
+
+        caption = re.sub(r'\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b', r' ', caption)  # j2d1a2a...
+
+        caption = re.sub(r'\b\d+\.?\d*[xх×]\d+\.?\d*\b', '', caption)
+
+        caption = re.sub(r'\b\s+\:\s+', r': ', caption)
+        caption = re.sub(r'(\D[,\./])\b', r'\1 ', caption)
+        caption = re.sub(r'\s+', ' ', caption)
+
+        # NOTE: the caption is intentionally NOT stripped at this point. The code used to contain
+        # a bare `caption.strip()` whose result was discarded; assigning it would change the
+        # output of the anchored patterns below and break parity with the training-time cleaning.
+
+        caption = re.sub(r'^[\"\']([\w\W]+)[\"\']$', r'\1', caption)
+        caption = re.sub(r'^[\'\_,\-\:;]', r'', caption)
+        caption = re.sub(r'[\'\_,\-\:\-\+]$', r'', caption)
+        caption = re.sub(r'^\.\S+$', '', caption)
+
+        return caption.strip()
--- a/diffusion/model/timestep_sampler.py
+++ b/diffusion/model/timestep_sampler.py
+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+from abc import ABC, abstractmethod
+
+import numpy as np
+import torch as th
+import torch.distributed as dist
+
+
+def create_named_schedule_sampler(name, diffusion):
+    """
+    Build one of the pre-defined schedule samplers by name.
+    :param name: the name of the sampler, "uniform" or "loss-second-moment".
+    :param diffusion: the diffusion object to sample for.
+    :return: a UniformSampler or a LossSecondMomentResampler built from `diffusion`.
+    :raises NotImplementedError: if `name` is not a known sampler.
+    """
+    if name == "uniform":
+        return UniformSampler(diffusion)
+    if name == "loss-second-moment":
+        return LossSecondMomentResampler(diffusion)
+    raise NotImplementedError(f"unknown schedule sampler: {name}")
+
+
+class ScheduleSampler(ABC):
+    """
+    Base class for distributions over the timesteps of the diffusion process, used to reduce
+    the variance of the training objective.
+    The default sample() performs unbiased importance sampling, so the mean of the objective
+    is unchanged. Subclasses may override sample() to reweight the resampled terms
+    differently, which does change the objective.
+    """
+
+    @abstractmethod
+    def weights(self):
+        """
+        Return a numpy array with one weight per diffusion step.
+        The weights needn't be normalized, but must be positive.
+        """
+
+    def sample(self, batch_size, device):
+        """
+        Draw importance-sampled timesteps for a batch.
+        :param batch_size: the number of timesteps to draw.
+        :param device: the torch device the results are moved to.
+        :return: a tuple (timesteps, weights):
+                 - timesteps: a long tensor of timestep indices.
+                 - weights: a float tensor of weights to scale the resulting losses.
+        """
+        raw = self.weights()
+        probs = raw / np.sum(raw)
+        num_steps = len(probs)
+        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
+        # Inverse-probability weights, scaled so that uniform sampling yields weights of 1.
+        importance = 1 / (num_steps * probs[drawn])
+        timesteps = th.from_numpy(drawn).long().to(device)
+        scales = th.from_numpy(importance).float().to(device)
+        return timesteps, scales
+
+
+class UniformSampler(ScheduleSampler):
+    """Schedule sampler that gives every diffusion timestep the same weight."""
+
+    def __init__(self, diffusion):
+        """Store `diffusion` and pre-compute one unit weight per entry of `diffusion.num_timesteps`."""
+        num_steps = diffusion.num_timesteps
+        self._weights = np.ones((num_steps,))
+        self.diffusion = diffusion
+
+    def weights(self):
+        """Return the constant weight array created in the constructor."""
+        return self._weights
+
+
+class LossAwareSampler(ScheduleSampler):
+    """Schedule sampler whose weights are updated from the losses observed during training."""
+
+    def update_with_local_losses(self, local_ts, local_losses):
+        """
+        Update the reweighting using losses from a model.
+        Call this method from each rank with a batch of timesteps and the
+        corresponding losses for each of those timesteps.
+        This method will perform synchronization to make sure all of the ranks
+        maintain the exact same reweighting.
+        :param local_ts: an integer Tensor of timesteps.
+        :param local_losses: a 1D Tensor of losses.
+        """
+        batch_sizes = [
+            th.tensor([0], dtype=th.int32, device=local_ts.device)
+            for _ in range(dist.get_world_size())
+        ]
+        dist.all_gather(
+            batch_sizes,
+            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
+        )
+
+        # Size the receive buffers for the largest batch reported by any rank.
+        # NOTE(review): the local tensors themselves are not padded before the gather, so this
+        # presumably relies on every rank using the same batch size - confirm against callers.
+        batch_sizes = [x.item() for x in batch_sizes]
+        max_bs = max(batch_sizes)
+
+        # The receive buffers must have the dtype of the tensors being gathered: with the default
+        # float dtype the integer timesteps would not match the buffers and would come back as
+        # floats, which cannot be used as indices in update_with_all_losses.
+        timestep_batches = [
+            th.zeros(max_bs, dtype=local_ts.dtype, device=local_ts.device) for _ in batch_sizes
+        ]
+        loss_batches = [
+            th.zeros(max_bs, dtype=local_losses.dtype, device=local_losses.device) for _ in batch_sizes
+        ]
+        dist.all_gather(timestep_batches, local_ts)
+        dist.all_gather(loss_batches, local_losses)
+        # Keep only the first `bs` entries of every rank's buffer and flatten to Python scalars.
+        timesteps = [
+            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
+        ]
+        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
+        self.update_with_all_losses(timesteps, losses)
+
+    @abstractmethod
+    def update_with_all_losses(self, ts, losses):
+        """
+        Update the reweighting using losses from a model.
+        Sub-classes should override this method to update the reweighting
+        using losses from the model.
+        This method directly updates the reweighting without synchronizing
+        between workers. It is called by update_with_local_losses from all
+        ranks with identical arguments. Thus, it should have deterministic
+        behavior to maintain state across workers.
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+
+
+class LossSecondMomentResampler(LossAwareSampler):
+    """
+    Loss-aware sampler that weights each timestep by the root mean square of its most recent
+    losses, mixed with a small uniform probability.
+    """
+
+    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
+        """
+        :param diffusion: the diffusion object; `diffusion.num_timesteps` sizes the history.
+        :param history_per_term: number of most recent losses kept per timestep.
+        :param uniform_prob: share of the probability mass spread uniformly over all timesteps.
+        """
+        self.diffusion = diffusion
+        self.history_per_term = history_per_term
+        self.uniform_prob = uniform_prob
+        self._loss_history = np.zeros(
+            [diffusion.num_timesteps, history_per_term], dtype=np.float64
+        )
+        # `np.int` was only an alias of the builtin `int` and has been removed from NumPy;
+        # the builtin gives the same dtype on every NumPy version.
+        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)
+
+    def weights(self):
+        """
+        Return one weight per timestep.
+        Until every timestep has a full loss history the weights are uniform (all ones);
+        afterwards they are the normalized RMS of the stored losses, blended with
+        `uniform_prob` of uniform mass.
+        """
+        if not self._warmed_up():
+            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
+        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
+        weights /= np.sum(weights)
+        weights *= 1 - self.uniform_prob
+        weights += self.uniform_prob / len(weights)
+        return weights
+
+    def update_with_all_losses(self, ts, losses):
+        """
+        Record one loss per given timestep.
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+        for t, loss in zip(ts, losses):
+            if self._loss_counts[t] == self.history_per_term:
+                # Shift out the oldest loss term.
+                self._loss_history[t, :-1] = self._loss_history[t, 1:]
+                self._loss_history[t, -1] = loss
+            else:
+                # History not full yet: write into the next free slot.
+                self._loss_history[t, self._loss_counts[t]] = loss
+                self._loss_counts[t] += 1
+
+    def _warmed_up(self):
+        """Return True once every timestep has `history_per_term` recorded losses."""
+        return (self._loss_counts == self.history_per_term).all()
--- a/diffusion/model/utils.py
+++ b/diffusion/model/utils.py
+import os
+import sys
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint, checkpoint_sequential
+import torch.nn.functional as F
+import torch
+import torch.distributed as dist
+import re
+import math
+from collections.abc import Iterable
+from itertools import repeat
+from torchvision import transforms as T
+import random
+from PIL import Image
+
+
+def _ntuple(n):
+    """
+    Build a converter that turns a scalar into an n-tuple.
+
+    The returned function passes any non-string iterable through unchanged and repeats
+    every other value (strings included) n times into a tuple.
+    """
+    def parse(x):
+        already_a_collection = isinstance(x, Iterable) and not isinstance(x, str)
+        return x if already_a_collection else tuple(repeat(x, n))
+    return parse
+
+
+# Ready-made converters for the common lengths.
+to_1tuple, to_2tuple = _ntuple(1), _ntuple(2)
+
+def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
+    """
+    Flag `model` and all of its sub-modules for gradient checkpointing.
+
+    Every module gets `grad_checkpointing = True`, `fp32_attention = use_fp32_attention`
+    and `grad_checkpointing_step = gc_step`; auto_grad_checkpoint() reads these attributes.
+    `model` must be an nn.Module.
+    """
+    assert isinstance(model, nn.Module)
+
+    flags = {
+        'grad_checkpointing': True,
+        'fp32_attention': use_fp32_attention,
+        'grad_checkpointing_step': gc_step,
+    }
+
+    def set_attr(module):
+        for attr_name, attr_value in flags.items():
+            setattr(module, attr_name, attr_value)
+
+    model.apply(set_attr)
+
+
+def auto_grad_checkpoint(module, *args, **kwargs):
+    """
+    Call `module(*args, **kwargs)`, going through gradient checkpointing when the module is
+    flagged with a truthy `grad_checkpointing` attribute.
+
+    Flagged iterable modules are run with checkpoint_sequential, using the
+    `grad_checkpointing_step` of their first element as chunk size; other flagged modules
+    are wrapped in a single checkpoint call.
+    """
+    if not getattr(module, 'grad_checkpointing', False):
+        return module(*args, **kwargs)
+    if isinstance(module, Iterable):
+        gc_step = module[0].grad_checkpointing_step
+        return checkpoint_sequential(module, gc_step, *args, **kwargs)
+    return checkpoint(module, *args, **kwargs)
+
+
+def checkpoint_sequential(functions, step, input, *args, **kwargs):
+    """
+    Run `functions` one after another on `input`, checkpointing them in chunks of `step`.
+
+    Every function is called as `fn(hidden, *args)`. All chunks except the last are wrapped
+    in `checkpoint`; the remaining functions run normally.
+    The only accepted keyword argument is `preserve_rng_state` (default True), which is
+    forwarded to `checkpoint`; any other keyword raises ValueError.
+    """
+    # Hack for keyword-only parameter in a python 2.7-compliant way
+    preserve = kwargs.pop('preserve_rng_state', True)
+    if kwargs:
+        raise ValueError("Unexpected keyword arguments: " + ",".join(kwargs))
+
+    def run_function(first, last, layers):
+        def forward(hidden):
+            for idx in range(first, last + 1):
+                hidden = layers[idx](hidden, *args)
+            return hidden
+        return forward
+
+    if isinstance(functions, torch.nn.Sequential):
+        functions = list(functions.children())
+
+    num_chunks = len(functions) // step
+    # the last chunk has to be non-volatile, so only the first num_chunks - 1 are checkpointed
+    last_checkpointed = -1
+    for first in range(0, step * (num_chunks - 1), step):
+        last_checkpointed = first + step - 1
+        chunk = run_function(first, last_checkpointed, functions)
+        input = checkpoint(chunk, input, preserve_rng_state=preserve)
+    tail = run_function(last_checkpointed + 1, len(functions) - 1, functions)
+    return tail(input)
+
+
+def window_partition(x, window_size):
+    """
+    Split a [B, H, W, C] tensor into non-overlapping square windows, zero-padding
+    the bottom/right edges when H or W is not a multiple of the window size.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): side length of each window.
+
+    Returns:
+        windows: tensor with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): height and width after padding, before the split.
+    """
+    batch, height, width, channels = x.shape
+
+    # Amount needed to reach the next multiple of window_size (0 if already aligned).
+    extra_h = (window_size - height % window_size) % window_size
+    extra_w = (window_size - width % window_size) % window_size
+    needs_padding = extra_h > 0 or extra_w > 0
+    if needs_padding:
+        # F.pad lists pads from the last dim backwards: (C_lo, C_hi, W_lo, W_hi, H_lo, H_hi).
+        x = F.pad(x, (0, 0, 0, extra_w, 0, extra_h))
+    padded_h = height + extra_h
+    padded_w = width + extra_w
+
+    rows = padded_h // window_size
+    cols = padded_w // window_size
+    grid = x.view(batch, rows, window_size, cols, window_size, channels)
+    # Bring the two window-grid axes together, then fold them into the batch axis.
+    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
+    windows = grid.view(-1, window_size, window_size, channels)
+    return windows, (padded_h, padded_w)
+
+
+def window_unpartition(windows, window_size, pad_hw, hw):
+    """
+    Reassemble windows into a [B, H, W, C] tensor and strip any padding.
+    Args:
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): side length of each window.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+
+    Returns:
+        x: reassembled tensor with [B, H, W, C].
+    """
+    padded_h, padded_w = pad_hw
+    height, width = hw
+
+    windows_per_image = padded_h * padded_w // window_size // window_size
+    batch = windows.shape[0] // windows_per_image
+
+    rows = padded_h // window_size
+    cols = padded_w // window_size
+    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
+    # Interleave grid and in-window axes back into full-resolution rows and columns.
+    x = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)
+
+    was_padded = padded_h > height or padded_w > width
+    if was_padded:
+        x = x[:, :height, :width, :].contiguous()
+    return x
+
+
+def get_rel_pos(q_size, k_size, rel_pos):
+    """
+    Look up relative positional embeddings for every (query, key) position pair.
+    Args:
+        q_size (int): size of query q.
+        k_size (int): size of key k.
+        rel_pos (Tensor): relative position embeddings (L, C).
+
+    Returns:
+        Tensor indexed as [q_size, k_size, C] holding the embedding selected for each pair.
+    """
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+
+    if rel_pos.shape[0] == max_rel_dist:
+        table = rel_pos
+    else:
+        # The table length does not match: linearly interpolate it along the
+        # position axis (interpolate expects channels before length).
+        channels_first = rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1)
+        table = F.interpolate(
+            channels_first,
+            size=max_rel_dist,
+            mode="linear",
+        )
+        table = table.reshape(-1, max_rel_dist).permute(1, 0)
+
+    # Scale the coords with short length if shapes for q and k are different.
+    q_scale = max(k_size / q_size, 1.0)
+    k_scale = max(q_size / k_size, 1.0)
+    q_coords = torch.arange(q_size)[:, None] * q_scale
+    k_coords = torch.arange(k_size)[None, :] * k_scale
+    # Shift so that the smallest difference maps to index 0.
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * k_scale
+
+    return table[relative_coords.long()]
+
+
+def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
+    """
+    Add decomposed (height + width) relative positional terms to an attention map,
+    following :paper:`mvitv2`.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
+    Args:
+        attn (Tensor): attention map.
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings,
+            shaped (B, q_h * q_w, k_h * k_w).
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    height_table = get_rel_pos(q_h, k_h, rel_pos_h)
+    width_table = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    batch, _, dim = q.shape
+    q_grid = q.reshape(batch, q_h, q_w, dim)
+    # Project the queries onto the per-axis embeddings.
+    height_bias = torch.einsum("bhwc,hkc->bhwk", q_grid, height_table)
+    width_bias = torch.einsum("bhwc,wkc->bhwk", q_grid, width_table)
+
+    # Broadcast the height term over key columns and the width term over key rows.
+    attn_grid = attn.view(batch, q_h, q_w, k_h, k_w)
+    attn_grid = attn_grid + height_bias[:, :, :, :, None] + width_bias[:, :, :, None, :]
+    attn = attn_grid.view(batch, q_h * q_w, k_h * k_w)
+
+    return attn
+
+def mean_flat(tensor):
+    """Average ``tensor`` over every dimension except the first."""
+    non_batch_dims = list(range(1, tensor.ndim))
+    return tensor.mean(dim=non_batch_dims)
+
+
+#################################################################################
+#                          Token Masking and Unmasking                          #
+#################################################################################
+def get_mask(batch, length, mask_ratio, device, mask_type=None, data_info=None, extra_len=0):
+    """
+    Get the binary mask for the input sequence.
+    Args:
+        - batch: batch size
+        - length: sequence length
+        - mask_ratio: ratio of tokens to mask
+        - device: device on which the mask (and the random noise) is created
+        - mask_type: one of 'random', 'group', 'fft', 'laplacian'
+        - data_info: dictionary with info for reconstruction; required for
+          'fft'/'laplacian', where it must hold either a precomputed 'strength'
+          or both 'N' (patch size, first element is used) and 'ori_img'
+        - extra_len: number of tokens subtracted from the keep budget
+    return:
+        mask_dict with following keys:
+        - mask: binary mask, 0 is keep, 1 is remove
+        - ids_keep: indices of tokens to keep
+        - ids_restore: indices to restore the original order
+        - ids_removed: indices of tokens that are dropped
+    """
+    assert mask_type in ['random', 'fft', 'laplacian', 'group']
+    mask = torch.ones([batch, length], device=device)
+    len_keep = int(length * (1 - mask_ratio)) - extra_len
+
+    if mask_type in ['random', 'group']:
+        noise = torch.rand(batch, length, device=device)  # noise in [0, 1]
+        ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+        # keep the first subset
+        ids_keep = ids_shuffle[:, :len_keep]
+        ids_removed = ids_shuffle[:, len_keep:]
+
+    elif mask_type in ['fft', 'laplacian']:
+        if 'strength' in data_info:
+            # presumably shaped (batch, length) — verify against caller
+            strength = data_info['strength']
+
+        else:
+            N = data_info['N'][0]
+            img = data_info['ori_img']
+            # Size of the original image
+            _, C, H, W = img.shape
+            if mask_type == 'fft':
+                # Reshape the image into patches (3, H/N, N, W/N, N)
+                reshaped_image = img.reshape((batch, -1, H // N, N, W // N, N))
+                fft_image = torch.fft.fftn(reshaped_image, dim=(3, 5))
+                # Sum of magnitudes per patch is used as its frequency strength
+                strength = torch.sum(torch.abs(fft_image), dim=(1, 3, 5)).reshape((batch, -1,))
+            elif mask_type == 'laplacian':
+                # NOTE: this branch previously tested the builtin ``type`` instead of
+                # ``mask_type`` and therefore never ran, leaving ``strength`` unbound.
+                # The kernel is created on the image's device/dtype so conv2d accepts it.
+                laplacian_kernel = torch.tensor(
+                    [[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]],
+                    dtype=img.dtype,
+                    device=img.device,
+                ).reshape(1, 1, 3, 3)
+                laplacian_kernel = laplacian_kernel.repeat(C, 1, 1, 1)
+                # Reshape the image into a batch of N x N patches: (num_patches, C, N, N)
+                reshaped_image = img.reshape(-1, C, H // N, N, W // N, N).permute(0, 2, 4, 1, 3, 5).reshape(-1, C, N, N)
+                laplacian_response = F.conv2d(reshaped_image, laplacian_kernel, padding=1, groups=C)
+                strength = laplacian_response.sum(dim=[1, 2, 3]).reshape((batch, -1,))
+
+        # Normalize the strength per sample, then draw a full permutation with
+        # torch.multinomial so stronger patches tend to come first (and are kept)
+        probabilities = strength / (strength.max(dim=1)[0][:, None]+1e-5)
+        ids_shuffle = torch.multinomial(probabilities.clip(1e-5, 1), length, replacement=False)
+        ids_keep = ids_shuffle[:, :len_keep]
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+        ids_removed = ids_shuffle[:, len_keep:]
+
+    # In shuffled order the first len_keep entries are kept; unshuffle to token order.
+    mask[:, :len_keep] = 0
+    mask = torch.gather(mask, dim=1, index=ids_restore)
+
+    return {'mask': mask,
+            'ids_keep': ids_keep,
+            'ids_restore': ids_restore,
+            'ids_removed': ids_removed}
+
+
+def mask_out_token(x, ids_keep, ids_removed=None):
+    """
+    Select the kept tokens (and optionally the removed ones) from a sequence.
+    Args:
+        - x: input sequence, [N, L, D]
+        - ids_keep: indices of tokens to keep
+        - ids_removed: optional indices of tokens that were dropped
+    return:
+        - x_remain when ids_removed is None, otherwise (x_remain, x_masked)
+    """
+    _, _, feature_dim = x.shape  # batch, length, dim
+
+    def _select(token_ids):
+        # Repeat each index across the feature axis so gather picks whole tokens.
+        index = token_ids.unsqueeze(-1).repeat(1, 1, feature_dim)
+        return torch.gather(x, dim=1, index=index)
+
+    x_remain = _select(ids_keep)
+    if ids_removed is None:
+        return x_remain
+    x_masked = _select(ids_removed)
+    return x_remain, x_masked
+
+
+def mask_tokens(x, mask_ratio):
+    """
+    Randomly drop a fraction of tokens independently for every sample.
+    The per-sample permutation comes from arg-sorting uniform noise.
+    x: [N, L, D], sequence
+    Returns (x_masked, mask, ids_restore); in mask, 0 is keep and 1 is remove.
+    """
+    batch, seq_len, feature_dim = x.shape  # batch, length, dim
+    len_keep = int(seq_len * (1 - mask_ratio))
+
+    noise = torch.rand(batch, seq_len, device=x.device)  # noise in [0, 1]
+
+    # ascending sort: tokens with small noise are kept, large are removed
+    ids_shuffle = torch.argsort(noise, dim=1)
+    # inverse permutation, used to return to the original token order
+    ids_restore = torch.argsort(ids_shuffle, dim=1)
+
+    # gather the kept subset
+    ids_keep = ids_shuffle[:, :len_keep]
+    gather_index = ids_keep.unsqueeze(-1).repeat(1, 1, feature_dim)
+    x_masked = torch.gather(x, dim=1, index=gather_index)
+
+    # build the mask in shuffled order, then unshuffle it
+    mask = torch.ones([batch, seq_len], device=x.device)
+    mask[:, :len_keep] = 0
+    mask = torch.gather(mask, dim=1, index=ids_restore)
+
+    return x_masked, mask, ids_restore
+
+
+def unmask_tokens(x, ids_restore, mask_token):
+    """Append mask tokens to ``x`` and unshuffle to the order given by ``ids_restore``."""
+    # x: [N, T, D] if extras == 0 (i.e., no cls token) else x: [N, T+1, D]
+    batch = x.shape[0]
+    num_missing = ids_restore.shape[1] - x.shape[1]
+    filler = mask_token.repeat(batch, num_missing, 1)
+    padded = torch.cat([x, filler], dim=1)
+    restore_index = ids_restore.unsqueeze(-1).repeat(1, 1, padded.shape[2])
+    return torch.gather(padded, dim=1, index=restore_index)  # unshuffle
+
+
+# Parse 'None' to None and others to float value
+def parse_float_none(s):
+    """Return ``None`` for the literal string 'None', otherwise ``float(s)``."""
+    assert isinstance(s, str)
+    if s == 'None':
+        return None
+    return float(s)
+
+
+#----------------------------------------------------------------------------
+# Parse a comma separated list of numbers or ranges and return a list of ints.
+# Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10]
+
+def parse_int_list(s):
+    """Expand a comma separated string of ints and inclusive ``a-b`` ranges.
+
+    A value that is already a list is returned unchanged.
+    """
+    if isinstance(s, list):
+        return s
+
+    range_re = re.compile(r'^(\d+)-(\d+)$')
+    values = []
+    for piece in s.split(','):
+        found = range_re.match(piece)
+        if found is None:
+            values.append(int(piece))
+            continue
+        first, last = int(found.group(1)), int(found.group(2))
+        values.extend(range(first, last + 1))
+    return values
+
+
+def init_processes(fn, args):
+    """ Initialize the distributed environment, run ``fn(args)``, then tear down.
+
+    Reads ``master_address``, ``local_rank``, ``global_rank`` and ``global_size``
+    from ``args``. ``cleanup()`` is only called when ``args.global_size > 1``.
+    """
+    os.environ['MASTER_ADDR'] = args.master_address
+    # The port is drawn at random in [2000, 6000] by this process.
+    # NOTE(review): every rank must end up with the same port for env:// rendezvous — confirm how ranks agree on it.
+    os.environ['MASTER_PORT'] = str(random.randint(2000, 6000))
+    print(f'MASTER_ADDR = {os.environ["MASTER_ADDR"]}')
+    print(f'MASTER_PORT = {os.environ["MASTER_PORT"]}')
+    torch.cuda.set_device(args.local_rank)
+    dist.init_process_group(backend='nccl', init_method='env://', rank=args.global_rank, world_size=args.global_size)
+    fn(args)
+    if args.global_size > 1:
+        cleanup()
+
+
+def mprint(*args, **kwargs):
+    """
+    ``print`` that is silent on every rank except rank 0.
+    """
+    if dist.get_rank() != 0:
+        return
+    print(*args, **kwargs)
+
+
+def cleanup():
+    """
+    End DDP training.
+
+    Synchronizes all ranks, prints "Done!" via ``mprint``, synchronizes again
+    and destroys the process group.
+    """
+    dist.barrier()
+    mprint("Done!")
+    # Second barrier runs before the process group is destroyed.
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+#----------------------------------------------------------------------------
+# logging info.
+class Logger(object):
+    """
+    Redirect stderr to stdout, optionally print stdout to a file,
+    and optionally force flushing on both stdout and the file.
+
+    Constructing an instance immediately replaces ``sys.stdout`` and
+    ``sys.stderr`` with the instance; ``close()`` (or leaving the ``with``
+    block) restores them. ``close()`` may be called more than once.
+    """
+
+    def __init__(self, file_name=None, file_mode="w", should_flush=True):
+        """
+        Args:
+            file_name: optional path of a file that mirrors everything written.
+            file_mode: mode passed to ``open`` for that file.
+            should_flush: flush stdout and the file after every write.
+        """
+        self.file = None
+
+        if file_name is not None:
+            self.file = open(file_name, file_mode)
+
+        self.should_flush = should_flush
+        # Remember the streams being replaced so close() can put them back.
+        self.stdout = sys.stdout
+        self.stderr = sys.stderr
+
+        sys.stdout = self
+        sys.stderr = self
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def write(self, text):
+        """Write text to stdout (and a file) and optionally flush."""
+        if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
+            return
+
+        if self.file is not None:
+            self.file.write(text)
+
+        self.stdout.write(text)
+
+        if self.should_flush:
+            self.flush()
+
+    def flush(self):
+        """Flush written text to both stdout and a file, if open."""
+        if self.file is not None:
+            self.file.flush()
+
+        self.stdout.flush()
+
+    def close(self):
+        """Flush, close possible files, and remove stdout/stderr mirroring."""
+        self.flush()
+
+        # if using multiple loggers, prevent closing in wrong order
+        if sys.stdout is self:
+            sys.stdout = self.stdout
+        if sys.stderr is self:
+            sys.stderr = self.stderr
+
+        if self.file is not None:
+            self.file.close()
+            # Drop the handle so a repeated close() or a late write()/flush()
+            # does not touch a closed file and raise ValueError.
+            self.file = None
+
+
+class StackedRandomGenerator:
+    """Holds one seeded ``torch.Generator`` per batch element.
+
+    Every sampling method draws each batch element from its own generator and
+    stacks the results along a new leading dimension.
+    """
+
+    def __init__(self, device, seeds):
+        super().__init__()
+        generators = []
+        for seed in seeds:
+            # Seeds are reduced modulo 2**32 before being applied.
+            generators.append(torch.Generator(device).manual_seed(int(seed) % (1 << 32)))
+        self.generators = generators
+
+    def randn(self, size, **kwargs):
+        """Normal samples of shape ``size``; ``size[0]`` must equal the number of seeds."""
+        assert size[0] == len(self.generators)
+        per_item_shape = size[1:]
+        draws = [torch.randn(per_item_shape, generator=gen, **kwargs) for gen in self.generators]
+        return torch.stack(draws)
+
+    def randn_like(self, input):
+        """Normal samples matching the shape, dtype, layout and device of ``input``."""
+        return self.randn(
+            input.shape,
+            dtype=input.dtype,
+            layout=input.layout,
+            device=input.device,
+        )
+
+    def randint(self, *args, size, **kwargs):
+        """Integer samples of shape ``size``; positional args are forwarded to ``torch.randint``."""
+        assert size[0] == len(self.generators)
+        per_item_shape = size[1:]
+        draws = [torch.randint(*args, size=per_item_shape, generator=gen, **kwargs) for gen in self.generators]
+        return torch.stack(draws)
+
+
+def prepare_prompt_ar(prompt, ratios, device='cpu', show=True):
+    """Strip size flags from a prompt and resolve them against ``ratios``.
+
+    Recognized flags are ``--aspect_ratio a:b``, ``--ar a:b`` and ``--hw h:w``.
+    ``--aspect_ratio`` takes precedence over ``--ar``; without either the
+    aspect ratio defaults to 1. The key of ``ratios`` closest to that value
+    selects the default (h, w); ``--hw`` overrides only the custom size.
+
+    Returns:
+        (prompt_clean, prompt_show, default_hw tensor, closest-ratio tensor, custom_hw tensor),
+        each tensor carrying a leading dimension of size 1.
+    """
+    def _as_quotient(spec):
+        parts = spec.split(':')
+        return float(parts[0]) / float(parts[1])
+
+    # get aspect_ratio or ar
+    aspect_ratios = re.findall(r"--aspect_ratio\s+(\d+:\d+)", prompt)
+    ars = re.findall(r"--ar\s+(\d+:\d+)", prompt)
+    custom_hw = re.findall(r"--hw\s+(\d+:\d+)", prompt)
+    if show:
+        print("aspect_ratios:", aspect_ratios, "ars:", ars, "hws:", custom_hw)
+
+    # Keep only the text that precedes the first size flag.
+    prompt_clean = prompt
+    for flag in ("--aspect_ratio", "--ar", "--hw"):
+        prompt_clean = prompt_clean.split(flag)[0]
+
+    nothing_specified = len(aspect_ratios) + len(ars) + len(custom_hw) == 0
+    if nothing_specified and show:
+        print( "Wrong prompt format. Set to default ar: 1. change your prompt into format '--ar h:w or --hw h:w' for correct generating")
+
+    if aspect_ratios:
+        ar = _as_quotient(aspect_ratios[0])
+    elif ars:
+        ar = _as_quotient(ars[0])
+    else:
+        ar = 1.
+
+    closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
+    default_hw = ratios[closest_ratio]
+    if custom_hw:
+        hw_parts = custom_hw[0].split(':')
+        custom_hw = [float(hw_parts[0]), float(hw_parts[1])]
+    else:
+        custom_hw = ratios[closest_ratio]
+
+    prompt_show = f'prompt: {prompt_clean.strip()}\nSize: --ar {closest_ratio}, --bin hw {ratios[closest_ratio]}, --custom hw {custom_hw}'
+    default_hw_t = torch.tensor(default_hw, device=device)[None]
+    ratio_t = torch.tensor([float(closest_ratio)], device=device)[None]
+    custom_hw_t = torch.tensor(custom_hw, device=device)[None]
+    return prompt_clean, prompt_show, default_hw_t, ratio_t, custom_hw_t
+
+
+def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int):
+    """Resize ``samples`` to cover the target size, then center-crop to it.
+
+    The height and width are read from ``samples.shape[2]`` and ``samples.shape[3]``.
+    The tensor is scaled by the larger of the two target/original ratios so both
+    dimensions are at least as large as requested, and the excess is cropped
+    away around the center.
+
+    Args:
+        samples: tensor whose dims 2 and 3 are height and width.
+        new_width: target width (converted with ``int``).
+        new_height: target height (converted with ``int``).
+
+    Returns:
+        The resized and cropped tensor, or ``samples`` itself when it already
+        has the requested height and width.
+    """
+    orig_hw = torch.tensor([samples.shape[2], samples.shape[3]], dtype=torch.int)
+    custom_hw = torch.tensor([int(new_height), int(new_width)], dtype=torch.int)
+
+    # Resize when EITHER dimension differs. The previous ``.all()`` check only
+    # fired when both differed, so a request that changed just the height or
+    # just the width silently returned the input unchanged.
+    if (orig_hw != custom_hw).any():
+        ratio = max(custom_hw[0] / orig_hw[0], custom_hw[1] / orig_hw[1])
+        resized_width = int(orig_hw[1] * ratio)
+        resized_height = int(orig_hw[0] * ratio)
+
+        transform = T.Compose([
+            T.Resize((resized_height, resized_width)),
+            T.CenterCrop(custom_hw.tolist())
+        ])
+        return transform(samples)
+    else:
+        return samples
+
+
+def resize_and_crop_img(img: Image, new_width, new_height):
+    """Scale ``img`` to cover (new_width, new_height), then crop the centered box.
+
+    The image is resized with LANCZOS resampling by the larger of the two
+    target/original ratios, and a ``new_width`` x ``new_height`` box centered
+    in the resized image is cropped out.
+    """
+    orig_width, orig_height = img.size
+
+    width_ratio = new_width/orig_width
+    height_ratio = new_height/orig_height
+    ratio = max(width_ratio, height_ratio)
+    resized_width = int(orig_width * ratio)
+    resized_height = int(orig_height * ratio)
+
+    img = img.resize((resized_width, resized_height), Image.LANCZOS)
+
+    # Box edges are symmetric around the center of the resized image.
+    crop_box = (
+        (resized_width - new_width)/2,
+        (resized_height - new_height)/2,
+        (resized_width + new_width)/2,
+        (resized_height + new_height)/2,
+    )
+    return img.crop(crop_box)
+
+
+
+def mask_feature(emb, mask):
+    """Trim or zero out embedding positions according to ``mask``.
+
+    With a batch of one, the embedding is truncated along dim 2 to the first
+    ``mask.sum()`` positions and that count is returned. For larger batches the
+    embedding is multiplied by the mask (broadcast over dims 1 and 3) and the
+    full length ``emb.shape[2]`` is returned.
+    """
+    if emb.shape[0] != 1:
+        broadcast_mask = mask[:, None, :, None]
+        return emb * broadcast_mask, emb.shape[2]
+
+    keep_index = mask.sum().item()
+    return emb[:, :, :keep_index, :], keep_index
\ No newline at end of file
--- a/diffusion/sa_sampler.py
+++ b/diffusion/sa_sampler.py
+"""SAMPLING ONLY."""
+
+import torch
+import numpy as np
+
+from diffusion.model.sa_solver import NoiseScheduleVP, model_wrapper, SASolver
+from .model import gaussian_diffusion as gd
+
+
+class SASolverSampler(object):
+    """Sampling wrapper that drives ``SASolver`` with a classifier-free-guided model.
+
+    The constructor builds the cumulative alpha products of the named beta
+    schedule; ``sample`` wraps the model with ``model_wrapper`` and runs the solver.
+    """
+
+    def __init__(self, model,
+                 noise_schedule="linear",
+                 diffusion_steps=1000,
+                 device='cpu',
+                 ):
+        """
+        Args:
+            model: network passed to ``model_wrapper`` at sampling time.
+            noise_schedule: schedule name forwarded to ``gd.get_named_beta_schedule``.
+            diffusion_steps: number of steps forwarded to ``gd.get_named_beta_schedule``.
+            device: device used for the initial noise and for the returned samples.
+        """
+        super().__init__()
+        self.model = model
+        self.device = device
+        # Detached float32 copy on ``device``.
+        to_torch = lambda x: x.clone().detach().to(torch.float32).to(device)
+        betas = torch.tensor(gd.get_named_beta_schedule(noise_schedule, diffusion_steps))
+        alphas = 1.0 - betas
+        self.register_buffer('alphas_cumprod', to_torch(np.cumprod(alphas, axis=0)))
+
+    def register_buffer(self, name, attr):
+        """Store ``attr`` on the instance under ``name``, moving plain tensors to CUDA first."""
+        # NOTE(review): the tensor is sent to "cuda" regardless of ``self.device``
+        # (which defaults to 'cpu') — confirm this is intended.
+        if type(attr) == torch.Tensor and attr.device != torch.device("cuda"):
+            attr = attr.to(torch.device("cuda"))
+        setattr(self, name, attr)
+
+    @torch.no_grad()
+    def sample(self, S, batch_size, shape, conditioning=None, callback=None, normals_sequence=None, img_callback=None, quantize_x0=False, eta=0., mask=None, x0=None, temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, verbose=True, x_T=None, log_every_t=100, unconditional_guidance_scale=1., unconditional_conditioning=None, model_kwargs=None, **kwargs):
+        """Draw ``batch_size`` samples of shape ``shape`` in ``S`` solver steps.
+
+        Only ``S``, ``batch_size``, ``shape``, ``conditioning``, ``eta``, ``x_T``,
+        ``unconditional_guidance_scale``, ``unconditional_conditioning`` and
+        ``model_kwargs`` are read in this method; the remaining parameters are
+        accepted but unused here.
+
+        Returns:
+            A tuple ``(samples, None)`` with the samples moved to ``self.device``.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        # A batch-size mismatch with the conditioning only produces a warning.
+        if conditioning is not None:
+            if isinstance(conditioning, dict):
+                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
+                if cbs != batch_size:
+                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+            elif conditioning.shape[0] != batch_size:
+                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+
+        # sampling
+        C, H, W = shape
+        size = (batch_size, C, H, W)
+
+        device = self.device
+        # Start from the caller's x_T when given, otherwise from fresh Gaussian noise.
+        img = torch.randn(size, device=device) if x_T is None else x_T
+        ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)
+
+        model_fn = model_wrapper(
+            self.model,
+            ns,
+            model_type="noise",
+            guidance_type="classifier-free",
+            condition=conditioning,
+            unconditional_condition=unconditional_conditioning,
+            guidance_scale=unconditional_guidance_scale,
+            model_kwargs=model_kwargs,
+        )
+
+        sasolver = SASolver(model_fn, ns, algorithm_type="data_prediction")
+
+        # tau is ``eta`` for t in [0.2, 0.8] and 0 outside that interval.
+        tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0
+
+        x = sasolver.sample(mode='few_steps', x=img, tau=tau_t, steps=S, skip_type='time', skip_order=1, predictor_order=2, corrector_order=2, pc_mode='PEC', return_intermediate=False)
+
+        return x.to(device), None
\ No newline at end of file