Commit 58d33d4c authored by wanglch

Initial commit
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
import shutil
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from transformers.models.clip.image_processing_clip import CLIPImageProcessor
import torch
from mplug_docowl.model import *
from icecream import ic
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda"):
    kwargs = {"device_map": device_map}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if 'paperowl' in model_name.lower() or 'docowl' in model_name.lower():
        # Multimodal DocOwl/PaperOwl checkpoint
        if model_base is not None:
            # this may be mm projector only
            print('Loading mPLUG-DocOwl from base model...')
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            cfg_pretrained = AutoConfig.from_pretrained(model_path)
            model = MPLUGDocOwlLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
            model = MPLUGDocOwlLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
    else:
        # Load language model
        if model_base is not None:
            # PEFT model: load the base weights, then merge the LoRA adapter
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    # vision_tower = model.get_model().vision_model
    # vision_tower.to(device=device, dtype=torch.float16)
    # image_processor = CLIPImageProcessor.from_pretrained(model_path)
    image_processor = None

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
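# --- Usage sketch (illustrative; not part of the original file) ---
# Shows how load_pretrained_model is typically called. The checkpoint path is a
# placeholder, and running this requires the mplug_docowl package and enough
# GPU memory for the fp16 (or 4-bit) weights.
if __name__ == "__main__":
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path="./checkpoints/mplug-docowl",  # hypothetical local checkpoint
        model_base=None,
        model_name="mplug-docowl",
        load_4bit=True,  # quantize to save memory; drop this to load fp16
    )
    print(type(model).__name__, context_len)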
# Copyright (c) Alibaba.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import copy
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.utils import logging
from transformers.models.auto import CONFIG_MAPPING

logger = logging.get_logger(__name__)
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
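# --- Illustrative check of the rope_scaling format (not in the original file) ---
# A valid value is a two-field dict with a supported type and a float factor > 1;
# anything else is rejected by _rope_scaling_validation above.
if __name__ == "__main__":
    cfg = LlamaConfig(rope_scaling={"type": "linear", "factor": 2.0})
    print(cfg.rope_scaling)  # {'type': 'linear', 'factor': 2.0}
    try:
        LlamaConfig(rope_scaling={"type": "ntk", "factor": 2.0})
    except ValueError as err:
        print("rejected:", err)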
class MplugOwlVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MplugOwlVisionModel`]. It is used to instantiate a
mPLUG-Owl vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the mPLUG-Owl
[x-plug/x_plug-llama-7b](https://huggingface.co/x-plug/x_plug-llama-7b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 448):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
"""
model_type = "mplug_owl_vision_model"
def __init__(
self,
hidden_size=1024,
intermediate_size=4096,
projection_dim=768,
num_hidden_layers=24,
num_attention_heads=16,
num_channels=3,
image_size=448,
patch_size=14,
hidden_act="quick_gelu",
layer_norm_eps=1e-6,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
use_flash_attn=False,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the vision config dict if we are loading from MplugOwlConfig
if config_dict.get("model_type") == "mplug-owl":
config_dict = config_dict["vision_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
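# --- Illustrative instantiation (not in the original file) ---
# With the defaults above, the vision tower expects 448x448 inputs split into
# 14x14 patches, i.e. (448 // 14) ** 2 = 1024 patch tokens per image.
if __name__ == "__main__":
    vis_cfg = MplugOwlVisionConfig()
    print(vis_cfg.image_size, vis_cfg.patch_size, (vis_cfg.image_size // vis_cfg.patch_size) ** 2)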
class MplugDocOwlHReducerConfig(PretrainedConfig):
model_type = "mplug_docowl_hreducer"
def __init__(
self,
hidden_size=1024,
initializer_range=0.02,
layer_norm_eps=1e-6,
conv_shape='1x4',
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.conv_shape = conv_shape
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the visual_abstractor config dict if we are loading from MplugOwlConfig
if config_dict.get("model_type") == "mplug-docowl":
config_dict = config_dict["hreducer_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
DEFAULT_VISUAL_CONFIG = {
"visual_model": MplugOwlVisionConfig().to_dict(),
"visual_hreducer": MplugDocOwlHReducerConfig().to_dict()
}
class MPLUGDocOwlConfig(LlamaConfig):
model_type = "mplug_docowl"
def __init__(self, visual_config=None, **kwargs):
if visual_config is None:
self.visual_config = DEFAULT_VISUAL_CONFIG
else:
self.visual_config = visual_config
super().__init__(
**kwargs,
)
if __name__ == "__main__":
print(MplugOwlVisionConfig().to_dict())
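# --- Illustrative composition check (not in the original file) ---
# MPLUGDocOwlConfig keeps the usual LLaMA text fields and nests the vision
# tower and H-Reducer settings under `visual_config`.
if __name__ == "__main__":
    doc_cfg = MPLUGDocOwlConfig()
    print(sorted(doc_cfg.visual_config.keys()))  # ['visual_hreducer', 'visual_model']
    print(doc_cfg.visual_config["visual_hreducer"]["conv_shape"])  # '1x4'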
# Copyright 2023 DAMO Academy and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import math
import os
import shutil
import warnings
import torch
from transformers import LlamaTokenizer
from .configuration_mplug_docowl import MPLUGDocOwlConfig
from icecream import ic
try:
from transformers import LlamaTokenizerFast
except ImportError as e:
warnings.warn(e)
warnings.warn(
"The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
)
LlamaTokenizerFast = None
"""
Sample usage:
```
python3 /pure-mlo-scratch/sfan/model-parallel-trainer/llama2megatron/convert_llama2hf.py \
--input_dir /pure-mlo-scratch/llama/ --model_size 7 --output_dir /pure-mlo-scratch/llama/converted_HF_7B
```
Thereafter, models can be loaded via:
```py
from transformers import LlamaForCausalLM, LlamaTokenizer
model = LlamaForCausalLM.from_pretrained("/output/path")
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
"""
llama_s2layer = {7: 32, 13: 40, 30: 60, 65: 80, 70: 80}
llama_s2heads = {7: 32, 13: 40, 30: 52, 65: 64, 70: 64}
llama_s2dense = {7: 11008, 13: 13824, 30: 17920, 65: 22016,
70: 28672} # should be (2/3)*4*d, but it isn't exactly that
llama_s2hidden = {7: 4096, 13: 5120, 30: 6656, 65: 8192, 70: 8192}
def compute_intermediate_size(n):
return int(math.ceil(n * 8 / 3) + 255) // 256 * 256
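# Worked example (illustrative): for the 7B hidden size of 4096,
# ceil(4096 * 8 / 3) = 10923 and (10923 + 255) // 256 * 256 = 11008,
# which matches llama_s2dense[7] above.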
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(model_path,
input_base_path,
model_size,
num_input_shards=1,
num_output_shards=2,
skip_permute=True,
norm_eps=1e-05):
# if os.path.exists(model_path):
# shutil.rmtree(model_path)
os.makedirs(model_path, exist_ok=True)
# tmp_model_path = os.path.join(model_path, "tmp")
tmp_model_path = model_path
os.makedirs(tmp_model_path, exist_ok=True)
num_shards = num_input_shards
n_layers = llama_s2layer[model_size]
n_heads = llama_s2heads[model_size]
n_heads_per_shard = n_heads // num_shards
n_dense = llama_s2dense[model_size]
n_hidden = llama_s2hidden[model_size]
hidden_per_head = n_hidden // n_heads
base = 10000.0
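# RoPE inverse frequencies: 1 / base**(2i/d) over the per-head hidden dimension,
# later attached to every layer as `rotary_emb.inv_freq`.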
inv_freq = 1.0 / (base ** (torch.arange(0, hidden_per_head, 2).float() / hidden_per_head))
# permute for sliced rotary
def permute(w, skip_permute=skip_permute):
if skip_permute:
return w
return w.view(n_heads, n_hidden // n_heads // 2, 2, n_hidden).transpose(1, 2).reshape(n_hidden, n_hidden)
print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
# Load weights
if num_shards==1:
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
# /pure-mlo-scratch/alhernan/megatron-data/checkpoints/llama2-7b-tp4-pp1-optim/release/mp_rank_00/model_optim_rng.pt
if os.path.exists(os.path.join(input_base_path, 'release')):
filename = os.path.join(input_base_path, 'release', 'mp_rank_00', 'model_optim_rng.pt')
elif input_base_path.split('/')[-1].startswith('iter_'):
iteration = eval(input_base_path.split('/')[-1].replace('iter_', '').lstrip('0'))
load_dir = '/'.join(input_base_path.split('/')[:-1])
filename = os.path.join(input_base_path, 'mp_rank_00', 'model_optim_rng.pt')
if not os.path.exists(filename):
filename = filename.replace('model_optim_rng.pt', 'model_rng.pt')
else:
tracker_filename = os.path.join(input_base_path, 'latest_checkpointed_iteration.txt')
with open(tracker_filename, 'r') as f:
metastring = f.read().strip()
iteration = 'iter_{:07d}'.format(int(metastring))
filename = os.path.join(input_base_path, iteration, 'mp_rank_00', 'model_optim_rng.pt')
if not os.path.exists(filename):
filename = filename.replace('model_optim_rng.pt', 'model_rng.pt')
original_filename = filename
loaded = torch.load(filename, map_location="cpu")['model']['language_model']
else:
# Sharded
filenames = []
for i in range(num_shards):
if os.path.exists(os.path.join(input_base_path, 'release')):
filename = os.path.join(input_base_path, 'release', f'mp_rank_{i:02d}', 'model_optim_rng.pt')
else:
tracker_filename = os.path.join(input_base_path, 'latest_checkpointed_iteration.txt')
with open(tracker_filename, 'r') as f:
metastring = f.read().strip()
iteration = 'iter_{:07d}'.format(int(metastring))
filename = os.path.join(input_base_path, iteration, f'mp_rank_{i:02d}', 'model_optim_rng.pt')
if not os.path.exists(filename):
filename = filename.replace('model_optim_rng.pt', 'model_rng.pt')
filenames.append(filename)
loaded = [
torch.load(filenames[i], map_location="cpu")['model']['language_model']
for i in range(num_shards)
]
print('Llama-Megatron Loaded!')
param_count = 0
index_dict = {"weight_map": {}}
print(f'Converting weights for {n_layers} layers...')
for layer_i in range(n_layers):
print(layer_i)
filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
if num_shards == 1:
# Unsharded
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.q_proj.weight"],
f"model.layers.{layer_i}.self_attn.k_proj.multiway.0.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.k_proj.multiway.0.weight"],
f"model.layers.{layer_i}.self_attn.v_proj.multiway.0.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.v_proj.multiway.0.weight"],
f"model.layers.{layer_i}.self_attn.k_proj.multiway.1.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.k_proj.multiway.1.weight"],
f"model.layers.{layer_i}.self_attn.v_proj.multiway.1.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.v_proj.multiway.1.weight"],
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded['encoder'][f"layers.{layer_i}.self_attention.o_proj.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded['encoder'][f"layers.{layer_i}.mlp.gate_proj.weight"],
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded['encoder'][f"layers.{layer_i}.mlp.down_proj.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": loaded['encoder'][f"layers.{layer_i}.mlp.up_proj.weight"],
f"model.layers.{layer_i}.input_layernorm.multiway.0.weight": loaded['encoder'][f"layers.{layer_i}.input_layernorm.multiway.0.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.multiway.0.weight": loaded['encoder'][f"layers.{layer_i}.post_attention_layernorm.multiway.0.weight"],
f"model.layers.{layer_i}.input_layernorm.multiway.1.weight": loaded['encoder'][f"layers.{layer_i}.input_layernorm.multiway.1.weight"],
f"model.layers.{layer_i}.post_attention_layernorm.multiway.1.weight": loaded['encoder'][f"layers.{layer_i}.post_attention_layernorm.multiway.1.weight"],
}
else:
raise NotImplementedError
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
print(f'Sharded file saved to {filename}')
filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
if num_shards==1:
# Unsharded
state_dict = {
"model.embed_tokens.weight": loaded['embedding']['word_embeddings']['weight'],
"model.norm.weight": loaded['encoder']['norm.weight'],
"lm_head.weight": loaded['encoder']['lm_head.weight'],
}
else:
state_dict = {
"model.embed_tokens.weight": loaded[0]['embedding']['word_embeddings']['weight'],
"model.norm.weight": loaded[0]['encoder']['norm.weight'],
"lm_head.weight": loaded[0]['encoder']['lm_head.weight'],
}
loaded_all = torch.load(original_filename, map_location="cpu")['model']
# Vision Part
state_dict.update({
"model.vision_model.embeddings.cls_token": loaded_all['vision_model']['cls_token'],
"model.vision_model.embeddings.patch_embed.weight": loaded_all['vision_model']['patch_embed']['weight'],
"model.vision_model.embeddings.position_embedding": loaded_all['vision_model']['position_embeddings'],
"model.vision_model.embeddings.pre_layernorm.bias": loaded_all['vision_model']['pre_layernorm']['bias'],
"model.vision_model.embeddings.pre_layernorm.weight": loaded_all['vision_model']['pre_layernorm']['weight'],
"model.vision_model.post_layernorm.bias": loaded_all['vision_model']['transformer']['final_layernorm.bias'],
"model.vision_model.post_layernorm.weight": loaded_all['vision_model']['transformer']['final_layernorm.weight'],
})
for v_layer_idx in range(24):
state_dict.update({
f"model.vision_model.encoder.layers.{v_layer_idx}.input_layernorm.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.input_layernorm.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.input_layernorm.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.input_layernorm.weight'],
f"model.vision_model.encoder.layers.{v_layer_idx}.mlp.fc1.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.mlp.dense_h_to_4h.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.mlp.fc1.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.mlp.dense_h_to_4h.weight'],
f"model.vision_model.encoder.layers.{v_layer_idx}.mlp.fc2.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.mlp.dense_4h_to_h.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.mlp.fc2.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.mlp.dense_4h_to_h.weight'],
f"model.vision_model.encoder.layers.{v_layer_idx}.post_attention_layernorm.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.post_attention_layernorm.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.post_attention_layernorm.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.post_attention_layernorm.weight'],
f"model.vision_model.encoder.layers.{v_layer_idx}.self_attn.dense.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.self_attention.dense.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.self_attn.dense.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.self_attention.dense.weight'],
f"model.vision_model.encoder.layers.{v_layer_idx}.self_attn.query_key_value.bias": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.self_attention.query_key_value.bias'],
f"model.vision_model.encoder.layers.{v_layer_idx}.self_attn.query_key_value.weight": loaded_all['vision_model']['transformer'][f'layers.{v_layer_idx}.self_attention.query_key_value.weight'],
})
# Vision2Text Part: HReducer
state_dict.update({
"model.vision2text.ln_q.weight": loaded_all['hreducer3']['ln_q']['weight'],
"model.vision2text.ln_q.bias": loaded_all['hreducer3']['ln_q']['bias'],
"model.vision2text.visual_fc.bias": loaded_all['hreducer3']['visual_fc']['bias'],
"model.vision2text.visual_fc.weight": loaded_all['hreducer3']['visual_fc']['weight'],
"model.vision2text.vit_eos": loaded_all['hreducer3']['vit_eos'],
})
# reducer_before: conv (layer 0) + GELU (layer 1)
state_dict.update({
f"model.vision2text.reducer_before.0.weight": loaded_all['hreducer3']['reducer_before']["0.weight"],
f"model.vision2text.reducer_before.0.bias": loaded_all['hreducer3']['reducer_before']["0.bias"],
})
# reducer conv
state_dict.update({
f"model.vision2text.reducer.weight": loaded_all['hreducer3']['reducer']["weight"],
f"model.vision2text.reducer.bias": loaded_all['hreducer3']['reducer']["bias"],
})
for k, v in state_dict.items():
# ic(k, v)
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
# Write configs
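# (total_size assumes fp16 storage: 2 bytes per parameter)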
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
config = MPLUGDocOwlConfig()
config.save_pretrained(tmp_model_path)
# Make space so we can load the model properly now.
del state_dict
del loaded
del loaded_all
gc.collect()
def write_tokenizer(tokenizer_path, input_tokenizer_path):
# Initialize the tokenizer based on the `spm` model
tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
tokenizer = tokenizer_class(input_tokenizer_path)
tokenizer.save_pretrained(tokenizer_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of LLaMA_Megatron weights",
)
parser.add_argument(
"--model_size",
type=int,
default=7,
choices=[7, 13, 30, 65, 70],
)
parser.add_argument(
"--num_input_shards",
type=int,
default=1,
)
parser.add_argument(
"--num_output_shards",
type=int,
default=1,
)
parser.add_argument('--skip_permute', action='store_true')
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
args = parser.parse_args()
write_model(
model_path=args.output_dir,
input_base_path=args.input_dir,
model_size=args.model_size,
num_input_shards=args.num_input_shards,
num_output_shards=args.num_output_shards,
skip_permute=args.skip_permute
)
if __name__ == "__main__":
main()
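# --- Illustrative invocation (not part of the original file) ---
# The exact module path depends on where this converter lives in the package
# (it uses a relative import of configuration_mplug_docowl), so run it as a
# module from the repository root; paths below are placeholders:
#     python -m mplug_docowl.model.convert_mplug_docowl_weight_to_hf \
#         --input_dir /path/to/megatron_checkpoint --model_size 7 \
#         --num_input_shards 1 --output_dir /path/to/hf_output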
from transformers import AutoConfig


def auto_upgrade(config):
    cfg = AutoConfig.from_pretrained(config)
    if 'mplug_owl2' in config and 'mplug_owl2' not in cfg.model_type:
        assert cfg.model_type == 'llama'  # old checkpoints still carry the base llama model_type
        print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
        print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
        confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
        if confirm.lower() in ["y", "yes"]:
            print("Upgrading checkpoint...")
            assert len(cfg.architectures) == 1
            setattr(cfg.__class__, "model_type", "mplug_owl2")
            cfg.architectures[0] = 'LlavaLlamaForCausalLM'
            cfg.save_pretrained(config)
            print("Checkpoint upgraded.")
        else:
            print("Checkpoint upgrade aborted.")
            exit(1)
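# Illustrative usage (path is a placeholder). Note that cfg.save_pretrained(config)
# rewrites config.json inside the checkpoint directory, so point this at a copy:
#     auto_upgrade("./checkpoints/old-mplug_owl2-checkpoint")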