"include/ck/config.hpp" did not exist on "b3e8d57d51300b88b591900621f71b6a1b3a7acc"
Commit 0bc22e1d authored by wanglch

Initial commit

# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
# Need to call this before importing transformers.
from vary.utils.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
# from vary.train.train import train
from vary.train.train_lora import train
if __name__ == "__main__":
train()
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pathlib
import torch
import transformers
from vary.train.trainer_vit_fixlr import varyTrainer
from vary.model import *
from vary.data import make_supervised_data_module
from vary.utils.arguments import *
from vary.utils.constants import *
from vary.model.vision_encoder.sam import build_sam_vit_b
def train():
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
tokenizer = transformers.AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=False, padding_side="right", model_max_length=training_args.model_max_length)
model = varyOPTForCausalLM.from_pretrained(model_args.model_name_or_path)
dtype = torch.float32
if training_args.fp16:
dtype = torch.float16
if training_args.bf16:
dtype = torch.bfloat16
vision_tower_dict = model.get_model().initialize_vision_modules(
vision_tower=model_args.vision_tower,
pretrained_stage1_model=model_args.pretrained_stage1_model,
freeze_vision_tower=model_args.freeze_vision_tower,
use_im_start_end=model_args.use_im_start_end,
vision_select_layer=model_args.vision_select_layer,
dtype=dtype,
device=training_args.device
)
model.initialize_vision_tokenizer(
tokenizer=tokenizer,
freeze_lm_model=model_args.freeze_lm_model,
pretrained_stage1_model=model_args.pretrained_stage1_model,
device=training_args.device,
)
model.to(dtype=dtype, device=training_args.device)
data_args.image_token_len = 256
data_args.image_processor = vision_tower_dict['image_processor']
data_args.image_processor_high = vision_tower_dict['image_processor_high']
data_args.use_im_start_end = model_args.use_im_start_end
# mixed relation, to be fixed
if model_args.freeze_lm_model:
model.requires_grad_(False)
for p in model.get_model().mm_projector.parameters():
p.requires_grad = True
for p in model.get_input_embeddings().parameters():
p.requires_grad = True
if not model_args.freeze_vision_tower:
model.get_model().vision_tower.requires_grad_(True)
params_grad = [p.numel() for n, p in model.named_parameters() if p.requires_grad]
print(f"Number of Mapping Trainable Parameters: {sum(params_grad) / (1 << 20):.2f} M")
# params_no_grad = [n for n, p in model.named_parameters() if not p.requires_grad]
# if len(params_no_grad) > 0:
# if training_args.fsdp is not None and len(training_args.fsdp) > 0:
# if len(params_no_grad) < 10:
# print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}'. format(len(params_no_grad), params_no_grad))
# else:
# print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}...(omitted)'. format(len(params_no_grad), ', '.join(params_no_grad[:10])))
# print("[WARNING] Attempting to use FSDP with partially frozen paramters, this is experimental.")
# print("[WARNING] As of 4/30/23, this feature requires PyTorch-nightly build. See here for details: https://github.com/haotian-liu/LLaVA#experimental-use-fsdp-to-save-memory-in-pretraining")
# from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
# def patch_FSDP_use_orig_params(func):
# def wrap_func(*args, **kwargs):
# use_orig_params = kwargs.pop('use_orig_params', True)
# return func(*args, **kwargs, use_orig_params=use_orig_params)
# return wrap_func
# FSDP.__init__ = patch_FSDP_use_orig_params(FSDP.__init__)
# interleave = True
data_module = make_supervised_data_module(
interleave=training_args.interleave,
with_box=training_args.with_box,
tokenizer=tokenizer,
data_args=data_args
)
trainer = varyTrainer(
model=model,
tokenizer=tokenizer,
args=training_args,
**data_module)
if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
trainer.train(resume_from_checkpoint=True)
else:
trainer.train()
trainer.save_state()
trainer._safe_save(output_dir=training_args.output_dir)
if __name__ == "__main__":
train()
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import pathlib
import torch
import transformers
from vary.train.trainer_vit_fixlr import varyTrainer
from vary.model import *
from vary.data import make_supervised_data_module
from vary.utils.arguments import *
from vary.utils.utils import smart_tokenizer_and_embedding_resize
from vary.model.vision_encoder.sam import build_sam_vit_b
def train():
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
tokenizer = transformers.AutoTokenizer.from_pretrained("model_args.model_name_or_path", trust_remote_code=True, padding_side="right", model_max_length=training_args.model_max_length,)
model = varyQwenForCausalLM.from_pretrained(model_args.model_name_or_path, low_cpu_mem_usage=True, device_map='cuda')
smart_tokenizer_and_embedding_resize(
special_tokens_dict=dict(pad_token='<|endoftext|>'),
tokenizer=tokenizer,
model=model,
)
dtype = torch.float32
if training_args.fp16:
dtype = torch.float16
if training_args.bf16:
dtype = torch.bfloat16
vision_tower_dict = model.get_model().initialize_vision_modules(
vision_tower=model_args.vision_tower,
pretrained_stage1_model=model_args.pretrained_stage1_model,
freeze_vision_tower=model_args.freeze_vision_tower,
use_im_start_end=model_args.use_im_start_end,
vision_select_layer=model_args.vision_select_layer,
dtype=dtype,
device=training_args.device
)
model.initialize_vision_tokenizer(
tokenizer=tokenizer,
freeze_lm_model=model_args.freeze_lm_model,
pretrained_stage1_model=model_args.pretrained_stage1_model,
device=training_args.device,
)
model.to(dtype=dtype, device=training_args.device)
data_args.image_token_len = 256
data_args.image_processor = vision_tower_dict['image_processor']
data_args.image_processor_high = vision_tower_dict['image_processor_high']
data_args.use_im_start_end = model_args.use_im_start_end
# mixed relation, to be fixed
if model_args.freeze_lm_model:
model.requires_grad_(False)
for p in model.get_model().mm_projector.parameters():
p.requires_grad = True
for p in model.get_model().mm_projector_vary.parameters():
p.requires_grad = True
for p in model.get_input_embeddings().parameters():
p.requires_grad = True
if not model_args.freeze_vision_tower:
model.get_model().vision_tower.requires_grad_(True)
model.get_model().vision_tower_high.requires_grad_(True)
params_grad = [p.numel() for n, p in model.named_parameters() if p.requires_grad]
print(f"Number of Mapping Trainable Parameters: {sum(params_grad) / (1 << 20):.2f} M")
# params_no_grad = [n for n, p in model.named_parameters() if not p.requires_grad]
# if len(params_no_grad) > 0:
# if training_args.fsdp is not None and len(training_args.fsdp) > 0:
# if len(params_no_grad) < 10:
# print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}'. format(len(params_no_grad), params_no_grad))
# else:
# print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}...(omitted)'. format(len(params_no_grad), ', '.join(params_no_grad[:10])))
# print("[WARNING] Attempting to use FSDP with partially frozen paramters, this is experimental.")
# print("[WARNING] As of 4/30/23, this feature requires PyTorch-nightly build. See here for details: https://github.com/haotian-liu/LLaVA#experimental-use-fsdp-to-save-memory-in-pretraining")
# from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
# def patch_FSDP_use_orig_params(func):
# def wrap_func(*args, **kwargs):
# use_orig_params = kwargs.pop('use_orig_params', True)
# return func(*args, **kwargs, use_orig_params=use_orig_params)
# return wrap_func
# FSDP.__init__ = patch_FSDP_use_orig_params(FSDP.__init__)
data_module = make_supervised_data_module(
interleave=training_args.interleave,
with_box=training_args.with_box,
tokenizer=tokenizer,
data_args=data_args
)
trainer = varyTrainer(
model=model,
tokenizer=tokenizer,
args=training_args,
**data_module)
if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
trainer.train(resume_from_checkpoint=True)
else:
trainer.train()
trainer.save_state()
trainer._safe_save(output_dir=training_args.output_dir)
if __name__ == "__main__":
train()
import os
import torch
import torch.nn as nn
from transformers import Trainer
from typing import Dict, Optional, Sequence
def unwrap_model(model: nn.Module) -> nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).
Args:
model (`torch.nn.Module`): The model to unwrap.
"""
# since there could be multiple levels of wrapping, unwrap recursively
if hasattr(model, "module"):
return unwrap_model(model.module)
else:
return model
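# Illustration added for clarity (not part of the original file): wrappers such as
# DistributedDataParallel expose the underlying network via `.module`, and
# unwrap_model peels such layers off recursively, e.g.
#   ddp_model = torch.nn.parallel.DistributedDataParallel(model)
#   assert unwrap_model(ddp_model) is model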
class varyTrainer(Trainer):
def _safe_save(self, output_dir: str):
"""Collects the state dict and dump to disk."""
if self.deepspeed:
torch.cuda.synchronize()
self.save_model(output_dir)
return
state_dict = self.model.state_dict()
if self.args.should_save:
cpu_state_dict = {
key: value.cpu()
for key, value in state_dict.items()
}
del state_dict
self._save(output_dir, state_dict=cpu_state_dict) # noqa
def _save(self, output_dir: Optional[str] = None, state_dict=None):
if getattr(self.args, 'tune_mm_mlp_adapter', False):
# Save the model
_state_dict = state_dict
if _state_dict is None:
# Only save the model itself if we are using distributed training
model_to_save = unwrap_model(self.model)
_state_dict = model_to_save.state_dict()
weight_to_save = {}
keys_to_match = ['mm_projector', 'embed_tokens', 'embed_in']
for k, v in _state_dict.items():
if any(key_match in k for key_match in keys_to_match):
weight_to_save[k] = v
current_folder = output_dir.split('/')[-1]
parent_folder = os.path.dirname(output_dir)
if current_folder.startswith('checkpoint-'):
mm_projector_folder = os.path.join(parent_folder, "mm_projector")
os.makedirs(mm_projector_folder, exist_ok=True)
torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
else:
torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))
super(varyTrainer, self)._save(output_dir, state_dict)
import os
import torch
import torch.nn as nn
from transformers import Trainer
from transformers.trainer_pt_utils import get_parameter_names
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from typing import Dict, Optional, Sequence
def unwrap_model(model: nn.Module) -> nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).
Args:
model (`torch.nn.Module`): The model to unwrap.
"""
# since there could be multiple levels of wrapping, unwrap recursively
if hasattr(model, "module"):
return unwrap_model(model.module)
else:
return model
class varyTrainer(Trainer):
def _safe_save(self, output_dir: str):
"""Collects the state dict and dump to disk."""
state_dict = self.model.state_dict()
if self.args.should_save:
cpu_state_dict = {
key: value.cpu()
for key, value in state_dict.items()
}
del state_dict
self._save(output_dir, state_dict=cpu_state_dict) # noqa
def _save(self, output_dir: Optional[str] = None, state_dict=None):
if getattr(self.args, 'tune_mm_mlp_adapter', False):
# Save the model
_state_dict = state_dict
if _state_dict is None:
# Only save the model itself if we are using distributed training
model_to_save = unwrap_model(self.model)
_state_dict = model_to_save.state_dict()
weight_to_save = {}
keys_to_match = ['mm_projector', 'embed_tokens', 'embed_in']
for k, v in _state_dict.items():
if any(key_match in k for key_match in keys_to_match):
weight_to_save[k] = v
current_folder = output_dir.split('/')[-1]
parent_folder = os.path.dirname(output_dir)
if current_folder.startswith('checkpoint-'):
mm_projector_folder = os.path.join(parent_folder, "mm_projector")
os.makedirs(mm_projector_folder, exist_ok=True)
torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
else:
torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))
super(varyTrainer, self)._save(output_dir, state_dict)
def create_optimizer(self):
"""
Setup the optimizer.
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
Trainer's init through `optimizers`, or subclass and override this method in a subclass.
"""
opt_model = self.model
if self.optimizer is None:
decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
{
"params": [
p for n, p in opt_model.named_parameters() if 'vision_encoder' in n and n in decay_parameters and p.requires_grad
],
"weight_decay": self.args.weight_decay,
"lr": self.args.learning_rate,
},
{
"params": [
p for n, p in opt_model.named_parameters() if 'vision_encoder' in n and n not in decay_parameters and p.requires_grad],
"weight_decay": 0.0,
"lr": self.args.learning_rate,
},
{
"params": [
p for n, p in opt_model.named_parameters() if 'vision_encoder' not in n and n in decay_parameters and p.requires_grad],
"weight_decay": self.args.weight_decay,
"lr": self.args.learning_rate,
},
{
"params": [
p for n, p in opt_model.named_parameters() if 'vision_encoder' not in n and n not in decay_parameters and p.requires_grad
],
"weight_decay": 0.0,
"lr": self.args.learning_rate,
},
]
for idx, group in enumerate(optimizer_grouped_parameters):
print(idx, len(group['params']), group['lr'])
optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
return self.optimizer
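# Note added for clarity (not in the original): the four groups above split parameters
# along two axes, (name contains 'vision_encoder') x (eligible for weight decay). All
# groups currently share self.args.learning_rate, so the split mainly serves as a hook
# for giving the vision encoder its own learning rate, which appears to be the point of
# this "_vit_fixlr" trainer variant.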
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence
import transformers
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
use_cache: bool = field(default=False)
vision_tower: Optional[str] = field(default="~/.cache/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff/")
freeze_vision_tower: bool = field(default=False)
freeze_lm_model: bool = field(default=False)
pretrained_stage1_model: Optional[str] = field(default=None) # mlp &/ vision tower
vision_select_layer: Optional[int] = field(default=-1) # default to the last layer
use_im_start_end: bool = field(default=False)
@dataclass
class DataArguments:
datasets: str = field(default=None, metadata={"help": "combinations of the training data."})
sep_image_conv_front: bool = False
image_token_len: int = 256
image_aspect_ratio: str = 'square'
conversation_version: str = 'mpt'
# conversation_version: str = 'v0'
# conversation_version: str = 'v1'
# conversation_version: str = 'opt'
box_limit: int = 0
@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
remove_unused_columns: bool = field(default=False)
force_fsdp: bool = field(default=False)
interleave: bool = field(default=False)
with_box: bool = field(default=False)
model_max_length: int = field(
default=512,
metadata={
"help":
"Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
lora_enable: bool = False
lora_r: int = 8
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_weight_path: str = ""
lora_bias: str = "none"
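# Illustrative parse (placeholders, not recommended settings): the three dataclasses
# above are consumed by the training scripts roughly like this:
#   parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
#   model_args, data_args, training_args = parser.parse_args_into_dataclasses(
#       args=["--output_dir", "/tmp/vary_debug", "--datasets", "pdf"])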
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15
LOGDIR = "log"
IGNORE_INDEX = -100
# DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_PAD_TOKEN = "<|endoftext|>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_BOX_TOKEN = "<box>"
DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
DEFAULT_IM_START_TOKEN = '<img>'
DEFAULT_IM_END_TOKEN = '</img>'
ROOT_PATH = '/data/public/ucaswei/data/'
CONVERSATION_DATA = {
# pair 4m
'laion-coco-4m': {
'images': '',
'annotations': '',
},
'cc665k': {
'images': "/path_to/LLaVA1.5/images/",
'annotations': "/path_to/LLaVA1.5/llava_v1_5_66k.json",
},
'pdf': {
'images': "",
'annotations': "",
},
'docvqa_train': {
'images': "",
'annotations': "",
},
'chartqa_train': {
'images': "",
'annotations': "",
},
}
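# Illustrative lookup (added for clarity; the data pipeline presumably resolves dataset
# names passed via --datasets against this registry):
#   meta = CONVERSATION_DATA['cc665k']
#   image_root, ann_path = meta['images'], meta['annotations']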
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
TWO = auto()
MPT = auto()
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "<|im_end|>"
sep2: str = None
version: str = "Unknown"
skip_next: bool = False
def get_prompt(self):
if self.sep_style == SeparatorStyle.SINGLE:
ret = self.system + self.sep + '\n'
for role, message in self.messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + self.sep
else:
ret += role + ":"
return ret
elif self.sep_style == SeparatorStyle.TWO:
seps = [self.sep, self.sep2]
ret = self.system + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
return ret
if self.sep_style == SeparatorStyle.MPT:
if self.system:
ret = self.system + self.sep
else:
ret = ''
for role, message in self.messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + message + self.sep
else:
ret += role
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def append_message(self, role, message):
self.messages.append([role, message])
def get_images(self, return_pil=False):
images = []
for i, (role, msg) in enumerate(self.messages[self.offset:]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO
from PIL import Image
msg, image, image_process_mode = msg
if image_process_mode == "Pad":
def expand2square(pil_img, background_color=(122, 116, 104)):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
# result.paste(pil_img, (0, (width - height) // 2))
result.paste(pil_img)
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
# result.paste(pil_img, ((height - width) // 2, 0))
result.paste(pil_img)
return result
image = expand2square(image)
elif image_process_mode == "Crop":
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
elif image_process_mode == "Resize":
image = image.resize((224, 224))
else:
raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
if return_pil:
images.append(image)
else:
buffered = BytesIO()
image.convert('RGB').save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
images.append(img_b64_str)
return images
def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset:]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO
msg, image, image_process_mode = msg
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
# image = image.resize((224, 224))
buffered = BytesIO()
image.save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user uploaded image" />'
msg = msg.replace('<image>', img_str)
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
sep2=self.sep2)
def dict(self):
if len(self.get_images()) > 0:
return {
"system": self.system,
"roles": self.roles,
"messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
"offset": self.offset,
"sep": self.sep,
"sep2": self.sep2,
}
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"sep": self.sep,
"sep2": self.sep2,
}
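# Usage sketch (assumed, mirroring how the templates defined below are meant to be
# consumed): copy a shared template before mutating it, append one user turn and an
# empty assistant turn, then render the prompt.
#   conv = conv_mpt.copy()
#   conv.append_message(conv.roles[0], "Describe this image. <image>")
#   conv.append_message(conv.roles[1], None)
#   prompt = conv.get_prompt()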
conv_v1 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human", "Assistant"),
messages=(
("Human", "Give three tips for staying healthy."),
("Assistant",
"Sure, here are three tips for staying healthy:\n"
"1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
"It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
"and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
"75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
"activities at least two days per week.\n"
"2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
"vegetables, whole grains, lean proteins, and healthy fats can help support "
"your overall health. Try to limit your intake of processed and high-sugar foods, "
"and aim to drink plenty of water throughout the day.\n"
"3. Get enough sleep: Getting enough quality sleep is essential for your physical "
"and mental health. Adults should aim for seven to nine hours of sleep per night. "
"Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
"help improve the quality of your sleep.")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
conv_v1_2 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human", "Assistant"),
messages=(
("Human", "What are the key differences between renewable and non-renewable energy sources?"),
("Assistant",
"Renewable energy sources are those that can be replenished naturally in a relatively "
"short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
"Non-renewable energy sources, on the other hand, are finite and will eventually be "
"depleted, such as coal, oil, and natural gas. Here are some key differences between "
"renewable and non-renewable energy sources:\n"
"1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
"energy sources are finite and will eventually run out.\n"
"2. Environmental impact: Renewable energy sources have a much lower environmental impact "
"than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
"and other negative effects.\n"
"3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
"have lower operational costs than non-renewable sources.\n"
"4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
"locations than non-renewable sources.\n"
"5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
"situations and needs, while non-renewable sources are more rigid and inflexible.\n"
"6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
"non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
conv_vicuna_v1_1 = Conversation(
system="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
conv_mpt = Conversation(
system="""<|im_start|>system
You should follow the instructions carefully and explain your answers in detail.""",
# system = None,
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
version="mpt",
messages=(),
offset=0,
sep_style=SeparatorStyle.MPT,
sep="<|im_end|>",
)
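# Worked example (added for reference): with the MPT separator style, appending a user
# turn "Hi" and an empty assistant turn to conv_mpt.copy() makes get_prompt() return:
#   <|im_start|>system\nYou should follow the instructions carefully and explain your
#   answers in detail.<|im_end|><|im_start|>user\nHi<|im_end|><|im_start|>assistant\n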
conv_mpt_eval = Conversation(
system="",
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
version="mpt",
messages=(),
offset=0,
sep_style=SeparatorStyle.MPT,
sep="<|im_end|>",
)
conv_mpt_text = Conversation(
system="""<|im_start|>system
- You are a helpful assistant chatbot trained by MosaicML.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
version="mpt",
messages=(),
offset=0,
sep_style=SeparatorStyle.MPT,
sep="<|im_end|>",
)
conv_bair_v1 = Conversation(
system="BEGINNING OF CONVERSATION:",
roles=("USER", "GPT"),
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
simple_conv = Conversation(
system="",
roles=("Human", "Assistant"),
messages=(
),
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
simple_conv_multimodal = Conversation(
system="You are vary, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
"You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
"Follow the instructions carefully and explain your answers in detail.",
# system="",
roles=("Human", "Assistant"),
messages=(
("Human", "Hi!"),
("Assistant", "Hi there! How can I help you today?\n")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
simple_conv_mpt_multimodal = Conversation(
system="""<|im_start|>system
- You are vary, a large language and vision assistant trained by Foundation Model Group, Megvii Technology.
- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
- You should follow the instructions carefully and explain your answers in detail.""",
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
version="mpt",
messages=(),
offset=0,
sep_style=SeparatorStyle.MPT,
sep="<|im_end|>",
)
simple_conv_legacy = Conversation(
system="You are vary, a large language model trained by Foundation Model Group, Megvii Technology."
"You are designed to assist human with a variety of tasks using natural language."
"Follow the instructions carefully.",
roles=("Human", "Assistant"),
messages=(
("Human", "Hi!\n\n### Response:"),
("Assistant", "Hi there! How can I help you today?\n")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
conv_llava_v1 = Conversation(
system="You are vary, a large language and vision assistant trained by Foundation Model Group, Megvii Technology."
"You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
"Follow the instructions carefully and explain your answers in detail.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
default_conversation = conv_mpt
conv_templates = {
"default": simple_conv_multimodal,
"simple": simple_conv,
"simple_legacy": simple_conv_legacy,
"multimodal": simple_conv,
"mpt_multimodal": simple_conv_mpt_multimodal,
"llava_v1": conv_llava_v1,
"mpt_eval": conv_mpt_eval,
# fastchat
"v1": conv_vicuna_v1_1,
"baichuan": conv_vicuna_v1_1,
"bair_v1": conv_bair_v1,
"vicuna_v1_1": conv_vicuna_v1_1,
"mpt": conv_mpt,
"qwen": conv_mpt,
"mpt_text": conv_mpt_text,
}
if __name__ == "__main__":
print(default_conversation.get_prompt())
# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
from typing import List, Optional, Tuple
import torch
from torch import nn
import transformers
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
from einops import rearrange
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import unpad_input, pad_input
def forward(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor],
Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel
attention_mask: [bsz, q_len]
"""
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
# [bsz, q_len, nh, hd]
# [bsz, nh, q_len, hd]
kv_seq_len = key_states.shape[-2]
offset = 0
if past_key_value is not None:
offset = past_key_value[0].shape[-2]
kv_seq_len += offset
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states,
key_states,
cos,
sin,
offset=offset)
# [bsz, nh, t, hd]
assert not output_attentions, "output_attentions is not supported"
assert not use_cache, "use_cache is not supported"
assert past_key_value is None, "past_key_value is not supported"
# Flash attention codes from
# https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
# transform the data into the format required by flash attention
qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd]
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
# We have disabled _prepare_decoder_attention_mask in LlamaModel
# the attention_mask should be the same as the key_padding_mask
key_padding_mask = attention_mask
if key_padding_mask is None:
qkv = rearrange(qkv, 'b s ... -> (b s) ...')
max_s = q_len
cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32,
device=qkv.device)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_q_lens, max_s, 0.0,
softmax_scale=None, causal=True
)
output = rearrange(output, '(b s) ... -> b s ...', b=bsz)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, 'b s three h d -> b s (three h d)')
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_q_lens, max_s, 0.0,
softmax_scale=None, causal=True
)
output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
indices, bsz, q_len),
'b s (h d) -> b s h d', h=nheads)
return self.o_proj(rearrange(output,
'b s h d -> b s (h d)')), None, None
# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
inputs_embeds, past_key_values_length):
# [bsz, seq_len]
return attention_mask
def replace_llama_attn_with_flash_attn():
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
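# Usage note (added for clarity): the training entry script applies this patch before
# importing the rest of the training stack, so every LlamaAttention created afterwards
# uses the flash-attention forward defined above:
#   from vary.utils.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
#   replace_llama_attn_with_flash_attn()
#   from vary.train.train_lora import train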
import datetime
import logging
import logging.handlers
import os
import sys
import torch
import requests
from transformers import StoppingCriteria
from vary.utils.constants import LOGDIR
server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
handler = None
def build_logger(logger_name, logger_filename):
global handler
formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Set the format of root handlers
if not logging.getLogger().handlers:
logging.basicConfig(level=logging.INFO)
logging.getLogger().handlers[0].setFormatter(formatter)
# Redirect stdout and stderr to loggers
stdout_logger = logging.getLogger("stdout")
stdout_logger.setLevel(logging.INFO)
sl = StreamToLogger(stdout_logger, logging.INFO)
sys.stdout = sl
stderr_logger = logging.getLogger("stderr")
stderr_logger.setLevel(logging.ERROR)
sl = StreamToLogger(stderr_logger, logging.ERROR)
sys.stderr = sl
# Get logger
logger = logging.getLogger(logger_name)
logger.setLevel(logging.INFO)
# Add a file handler for all loggers
if handler is None:
os.makedirs(LOGDIR, exist_ok=True)
filename = os.path.join(LOGDIR, logger_filename)
handler = logging.handlers.TimedRotatingFileHandler(
filename, when='D', utc=True)
handler.setFormatter(formatter)
for name, item in logging.root.manager.loggerDict.items():
if isinstance(item, logging.Logger):
item.addHandler(handler)
return logger
class StreamToLogger(object):
"""
Fake file-like stream object that redirects writes to a logger instance.
"""
def __init__(self, logger, log_level=logging.INFO):
self.terminal = sys.stdout
self.logger = logger
self.log_level = log_level
self.linebuf = ''
def __getattr__(self, attr):
return getattr(self.terminal, attr)
def write(self, buf):
temp_linebuf = self.linebuf + buf
self.linebuf = ''
for line in temp_linebuf.splitlines(True):
# From the io.TextIOWrapper docs:
# On output, if newline is None, any '\n' characters written
# are translated to the system default line separator.
# By default sys.stdout.write() expects '\n' newlines and then
# translates them so this is still cross platform.
if line[-1] == '\n':
self.logger.log(self.log_level, line.rstrip())
else:
self.linebuf += line
def flush(self):
if self.linebuf != '':
self.logger.log(self.log_level, self.linebuf.rstrip())
self.linebuf = ''
def disable_torch_init():
"""
Disable the redundant torch default initialization to accelerate model creation.
"""
import torch
setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
def violates_moderation(text):
"""
Check whether the text violates OpenAI moderation API.
"""
url = "https://api.openai.com/v1/moderations"
headers = {"Content-Type": "application/json",
"Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
text = text.replace("\n", "")
data = "{" + '"input": ' + f'"{text}"' + "}"
data = data.encode("utf-8")
try:
ret = requests.post(url, headers=headers, data=data, timeout=5)
flagged = ret.json()["results"][0]["flagged"]
except requests.exceptions.RequestException as e:
flagged = False
except KeyError as e:
flagged = False
return flagged
def pretty_print_semaphore(semaphore):
if semaphore is None:
return "None"
return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.keyword_ids = [tokenizer(keyword).input_ids for keyword in keywords]
self.keyword_ids = [keyword_id[0] for keyword_id in self.keyword_ids if type(keyword_id) is list and len(keyword_id) == 1]
self.tokenizer = tokenizer
self.start_len = None
self.input_ids = input_ids
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
if self.start_len is None:
self.start_len = self.input_ids.shape[1]
else:
for keyword_id in self.keyword_ids:
if output_ids[0, -1] == keyword_id:
return True
outputs = self.tokenizer.batch_decode(output_ids[:, self.start_len:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
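# Usage sketch (assumed; follows the standard transformers generate() API): stop decoding
# once the conversation separator shows up in the newly generated text.
#   from transformers import StoppingCriteriaList
#   stop = KeywordsStoppingCriteria(["<|im_end|>"], tokenizer, input_ids)
#   output_ids = model.generate(input_ids, stopping_criteria=StoppingCriteriaList([stop]))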
def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
"""Resize tokenizer and embedding.
Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
"""
# num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# # num_new_tokens = 1
# # tokenizer.add_tokens(special_tokens_dict, special_tokens=True)
# model.resize_token_embeddings(len(tokenizer))
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
if num_new_tokens > 0:
input_embeddings = model.get_input_embeddings().weight.data
output_embeddings = model.get_output_embeddings().weight.data
input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True)
output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
dim=0, keepdim=True)
input_embeddings[-num_new_tokens:] = input_embeddings_avg
output_embeddings[-num_new_tokens:] = output_embeddings_avg
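# Comment added for clarity: new embedding rows would otherwise be randomly initialized,
# which tends to produce outsized logits for the new tokens; seeding them with the mean
# of the existing embeddings keeps early training stable. The Qwen training script calls:
#   smart_tokenizer_and_embedding_resize(
#       special_tokens_dict=dict(pad_token='<|endoftext|>'), tokenizer=tokenizer, model=model)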
def maybe_zero_3(param, ignore_status=False, name=None):
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
if hasattr(param, "ds_id"):
if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
if not ignore_status:
logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
with zero.GatheredParameters([param]):
param = param.data.detach().cpu().clone()
else:
param = param.detach().cpu().clone()
return param
# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
if bias == "none":
to_return = {k: t for k, t in named_params if "lora_" in k}
elif bias == "all":
to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
elif bias == "lora_only":
to_return = {}
maybe_lora_bias = {}
lora_bias_names = set()
for k, t in named_params:
if "lora_" in k:
to_return[k] = t
bias_name = k.split("lora_")[0] + "bias"
lora_bias_names.add(bias_name)
elif "bias" in k:
maybe_lora_bias[k] = t
for k, t in maybe_lora_bias.items():
if k in lora_bias_names:
to_return[k] = t
else:
raise NotImplementedError
to_return = {k: maybe_zero_3(v, name=k) for k, v in to_return.items()}
return to_return
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
to_return = {k: t for k, t in named_params if "lora_" not in k}
if require_grad_only:
to_return = {k: t for k, t in to_return.items() if t.requires_grad}
to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
return to_return
def find_all_linear_names(model):
cls = torch.nn.Linear
lora_module_names = set()
for name, module in model.named_modules():
if isinstance(module, cls) and 'vision_model' not in name and 'mm_projector' not in name and 'vision_encoder' not in name and 'conv_final' not in name and 'lm_head' not in name:
lora_module_names.add(name)
print(lora_module_names)
return list(lora_module_names)
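# Typical follow-up (sketch assuming peft is used for the LoRA run; not shown in this
# file): the collected Linear module names become the LoRA target modules.
#   from peft import LoraConfig
#   lora_config = LoraConfig(r=training_args.lora_r, lora_alpha=training_args.lora_alpha,
#                            lora_dropout=training_args.lora_dropout,
#                            target_modules=find_all_linear_names(model),
#                            bias=training_args.lora_bias, task_type="CAUSAL_LM")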
{
"bf16": {
"enabled": true
},
"train_micro_batch_size_per_gpu": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto"
}
}