Initial commit

b81b2f59 · wanglch · f7c86e68 · f7c86e68 · f7c86e68 · f7c86e68
Commit b81b2f59 authored Jul 03, 2024 by wanglch
20 changed files
--- a/data_generation/grit/third_party/CenterNet2/docs/notes/contributing.md
+++ b/data_generation/grit/third_party/CenterNet2/docs/notes/contributing.md
-../../.github/CONTRIBUTING.md
\ No newline at end of file
--- a/data_generation/grit/third_party/CenterNet2/docs/tutorials/builtin_datasets.md
+++ b/data_generation/grit/third_party/CenterNet2/docs/tutorials/builtin_datasets.md
-../../datasets/README.md
\ No newline at end of file
--- a/data_generation/grit/third_party/CenterNet2/docs/tutorials/getting_started.md
+++ b/data_generation/grit/third_party/CenterNet2/docs/tutorials/getting_started.md
-../../GETTING_STARTED.md
\ No newline at end of file
--- a/data_generation/grit/third_party/CenterNet2/docs/tutorials/install.md
+++ b/data_generation/grit/third_party/CenterNet2/docs/tutorials/install.md
-../../INSTALL.md
\ No newline at end of file
--- a/demo.py
+++ b/demo.py
-
-from argparse import ArgumentParser
-from pathlib import Path
-
-import copy
-import gradio as gr
-import os
-import re
-import secrets
-import tempfile
-
-from PIL import Image
-from monkey_model.modeling_monkey import MonkeyLMHeadModel
-from monkey_model.tokenization_qwen import QWenTokenizer
-from monkey_model.configuration_monkey import MonkeyConfig
-
-import shutil
-from pathlib import Path
-import json
-DEFAULT_CKPT_PATH = '/home/zhangli/demo/'
-BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
-PUNCTUATION = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
-title_markdown = ("""
-# Welcome to Monkey
-
-Hello! I'm Monkey, a Large Language and Vision Assistant. Before talking to me, please read the **Operation Guide** and **Terms of Use**.
-你好！我是Monkey，一个大型语言和视觉助理。在与我交谈之前，请阅读**操作指南**和**使用条款**。
-## Operation Guide 操作指南
-
-Click the **Upload** button to upload an image. Then, you can get Monkey's answer in two ways:点击**Upload**上传图像。你可以通过两种方式得到Monkey的回答：
- - Click the **Generate** and Monkey will generate a description of the image. 点击**Generate**，Monkey将生成图像的描述。
- - Enter the question in the dialog box, click the **Submit**, and Monkey will answer the question based on the image. 在对话框中输入问题，点击**Submit**，Monkey会根据图片回答问题。
- - Click **Clear History** to clear the current image and Q&A content.点击**Clear History**，清除当前图片和问答内容。
-> Note: Monkey does not have a multi-round dialogue function. Perhaps we will further develop its capabilities in the future. 注意：Monkey没有多轮对话功能，或许我们在未来会进一步开发它的能力。
-> Monkey支持中文,但使用英文提问会比使用中文效果明显好.""")
-
-policy_markdown = ("""
-## Terms of Use
-
-By using this service, users are required to agree to the following terms:
-
- - Monkey is for research use only and unauthorized commercial use is prohibited. For any query, please contact the author.
- - Monkey's generation capabilities are limited, so we recommend that users do not rely entirely on its answers.
- - Monkey's security measures are limited, so we cannot guarantee that the output is completely appropriate. We strongly recommend that users do not intentionally guide Monkey to generate harmful content, including hate speech, discrimination, violence, pornography, deception, etc.
-
-""")
-def _get_args():
-    parser = ArgumentParser()
-    parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH,
-                        help="Checkpoint name or path, default to %(default)r")
-    parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only")
-
-    parser.add_argument("--share", action="store_true", default=False,
-                        help="Create a publicly shareable link for the interface.")
-    parser.add_argument("--inbrowser", action="store_true", default=False,
-                        help="Automatically launch the interface in a new tab on the default browser.")
-    parser.add_argument("--server-port", type=int, default=8000,
-                        help="Demo server port.")
-    parser.add_argument("--server-name", type=str, default="127.0.0.1",
-                        help="Demo server name.")
-
-    args = parser.parse_args()
-    return args
-
-
-def _load_model_tokenizer(args):
-    tokenizer = QWenTokenizer.from_pretrained(
-        args.checkpoint_path, trust_remote_code=True)
-
-    if args.cpu_only:
-        device_map = "cpu"
-    else:
-        device_map = "cuda"
-
-    model = MonkeyLMHeadModel.from_pretrained(
-        args.checkpoint_path,
-        device_map=device_map,
-        trust_remote_code=True,
-    ).eval()
-    # model.generation_config = GenerationConfig.from_pretrained(
-    #     args.checkpoint_path, trust_remote_code=True, resume_download=True,
-    # )
-    tokenizer.padding_side = 'left'
-    tokenizer.pad_token_id = tokenizer.eod_id
-    return model, tokenizer
-
-
-def _parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line != ""]
-    count = 0
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                lines[i] = f'<pre><code class="language-{items[-1]}">'
-            else:
-                lines[i] = f"<br></code></pre>"
-        else:
-            if i > 0:
-                if count % 2 == 1:
-                    line = line.replace("`", r"\`")
-                    line = line.replace("<", "&lt;")
-                    line = line.replace(">", "&gt;")
-                    line = line.replace(" ", "&nbsp;")
-                    line = line.replace("*", "&ast;")
-                    line = line.replace("_", "&lowbar;")
-                    line = line.replace("-", "&#45;")
-                    line = line.replace(".", "&#46;")
-                    line = line.replace("!", "&#33;")
-                    line = line.replace("(", "&#40;")
-                    line = line.replace(")", "&#41;")
-                    line = line.replace("$", "&#36;")
-                lines[i] = "<br>" + line
-    text = "".join(lines)
-    return text
-
-
-def _launch_demo(args, model, tokenizer):
-    def predict(_chatbot, task_history):
-        chat_query = _chatbot[-1][0]
-        query = task_history[-1][0]
-        question =  _parse_text(query)
-        print("User: " + _parse_text(query))
-        full_response = ""
-
-
-        img_path = _chatbot[0][0][0]
-        try:
-            Image.open(img_path)
-        except:
-            response = "Please upload a picture."
-            _chatbot[-1] = (_parse_text(chat_query), response)
-            full_response = _parse_text(response)
-
-            task_history[-1] = (query, full_response)
-            print("Monkey: " + _parse_text(full_response))
-            return _chatbot
-
-        query = f'<img>{img_path}</img> {question} Answer: '
-        print(query)
-
-        input_ids = tokenizer(query, return_tensors='pt', padding='longest')
-        attention_mask = input_ids.attention_mask
-        input_ids = input_ids.input_ids
-        
-        pred = model.generate(
-            input_ids=input_ids.cuda(),
-            attention_mask=attention_mask.cuda(),
-            do_sample=False,
-            num_beams=1,
-            max_new_tokens=512,
-            min_new_tokens=1,
-            length_penalty=1,
-            num_return_sequences=1,
-            output_hidden_states=True,
-            use_cache=True,
-            pad_token_id=tokenizer.eod_id,
-            eos_token_id=tokenizer.eod_id,
-            )
-        response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
-
-        _chatbot[-1] = (_parse_text(chat_query), response)
-        full_response = _parse_text(response)
-        task_history[-1] = (query, full_response)
-        print("Monkey: " + _parse_text(full_response))
-        return _chatbot
-    
-    def caption(_chatbot, task_history):
-
-        
-        query = "Generate the detailed caption in English:"
-        chat_query = "Generate the detailed caption in English:"
-        question =  _parse_text(query)
-        print("User: " + _parse_text(query))
-
-        full_response = ""
-        
-        try:
-            img_path = _chatbot[0][0][0]
-            Image.open(img_path)
-        except:
-            response = "Please upload a picture."
-
-            _chatbot.append((None, response))
-            full_response = _parse_text(response)
-
-            task_history.append((None, full_response))
-            print("Monkey: " + _parse_text(full_response))
-            return _chatbot
-        img_path = _chatbot[0][0][0]
-        query = f'<img>{img_path}</img> {chat_query} '
-        print(query)
-        input_ids = tokenizer(query, return_tensors='pt', padding='longest')
-        attention_mask = input_ids.attention_mask
-        input_ids = input_ids.input_ids
-        
-
-        pred = model.generate(
-            input_ids=input_ids.cuda(),
-            attention_mask=attention_mask.cuda(),
-            do_sample=True,
-            temperature=0.7,
-            max_new_tokens=250,
-            min_new_tokens=1,
-            length_penalty=1,
-            num_return_sequences=1,
-            output_hidden_states=True,
-            use_cache=True,
-            pad_token_id=tokenizer.eod_id,
-            eos_token_id=tokenizer.eod_id,
-            )
-        response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
-
-        
-        _chatbot.append((None, response))
-        full_response = _parse_text(response)
-
-        task_history.append((None, full_response))
-        print("Monkey: " + _parse_text(full_response))
-        return _chatbot
-   
-
-
-    def add_text(history, task_history, text):
-        task_text = text
-        if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
-            task_text = text[:-1]
-        history = history + [(_parse_text(text), None)]
-        task_history = task_history + [(task_text, None)]
-        print(history, task_history, text)
-        return history, task_history, ""
-
-    def add_file(history, task_history, file):
-        history =  [((file.name,), None)]
-        task_history = [((file.name,), None)]
-        print(history, task_history, file)
-        return history, task_history
-
-    def reset_user_input():
-        return gr.update(value="")
-
-    def reset_state(task_history):
-        task_history.clear()
-        return []
-
-
-    with gr.Blocks() as demo:
-        gr.Markdown(title_markdown)
-
-        chatbot = gr.Chatbot(label='Monkey', elem_classes="control-height", height=600,avatar_images=("https://ooo.0x0.ooo/2023/11/09/OehsLx.png","https://ooo.0x0.ooo/2023/11/09/OehGBC.png"),layout="bubble",bubble_full_width=False,show_copy_button=True)
-        query = gr.Textbox(lines=1, label='Input')
-        task_history = gr.State([])
-
-        with gr.Row():
-            empty_bin = gr.Button("Clear History (清空)")
-            submit_btn = gr.Button("Submit (提问)")
-            
-            generate_btn_en = gr.Button("Generate")
-            addfile_btn = gr.UploadButton("Upload (上传图片)", file_types=["image"])
-
-        submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
-            predict, [chatbot, task_history], [chatbot], show_progress=True
-        )
-        generate_btn_en.click(caption, [chatbot, task_history], [chatbot], show_progress=True)
-        
-        submit_btn.click(reset_user_input, [], [query])
-        empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
-        
-        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True,scroll_to_output=True)
-        
-
-
-        gr.Markdown(policy_markdown)
-    demo.queue().launch(
-        server_name="0.0.0.0",
-        server_port=7681
-    )
-
-
-def main():
-    args = _get_args()
-
-    model, tokenizer = _load_model_tokenizer(args)
-    _launch_demo(args, model, tokenizer)
-
-
-if __name__ == '__main__':
-    main()
--- a/demo_textmonkey.py
+++ b/demo_textmonkey.py
@@ -5,6 +5,7 @@ from monkey_model.modeling_textmonkey import TextMonkeyLMHeadModel
 from monkey_model.tokenization_qwen import QWenTokenizer
 from monkey_model.configuration_monkey import MonkeyConfig
 from argparse import ArgumentParser
+import torch

 def _get_args():
    parser = ArgumentParser()
@@ -21,7 +22,7 @@ def _get_args():
    return args
 args = _get_args()
 checkpoint_path = args.checkpoint_path
-device_map = "cuda"
+device_map = "auto"
 # Create model
 config = MonkeyConfig.from_pretrained(
        checkpoint_path,
@@ -73,7 +74,7 @@ def inference(input_str, input_image):
    pred = model.generate(
    input_ids=input_ids.cuda(),
    attention_mask=attention_mask.cuda(),
-    do_sample=False,
+    do_sample=True,
    num_beams=1,
    max_new_tokens=2048,
    min_new_tokens=1,
@@ -86,7 +87,7 @@ def inference(input_str, input_image):
    )
    response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=False).strip()
    image = Image.open(input_image).convert("RGB").resize((1000,1000))
-    font = ImageFont.truetype('NimbusRoman-Regular.otf', 22)
+    font = ImageFont.load_default()  # 使用系统默认字体
    bboxes = re.findall(r'<box>(.*?)</box>', response, re.DOTALL)
    refs = re.findall(r'<ref>(.*?)</ref>', response, re.DOTALL)
    if len(refs)!=0:

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
+# Image for running TextMonkey, built on the PyTorch 2.1.0 / DTK 24.04.1 /
+# Python 3.10 base image named in the tag below.
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
+ENV DEBIAN_FRONTEND=noninteractive
+COPY requirements.txt requirements.txt
+# Install from the mirror's HTTPS endpoint so packages are not downloaded over
+# an unverified plain-HTTP channel; with HTTPS, --trusted-host is no longer
+# needed. --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
+transformers==4.32.0
+accelerate
+tiktoken
+einops
+einops_exts
+transformers_stream_generator==0.0.4
+scipy
+pillow
+tensorboard
+matplotlib
+deepspeed
+gradio
+peft
\ No newline at end of file
--- a/finetune/ds_config_zero3.json
+++ b/finetune/ds_config_zero3.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 100,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
--- a/finetune/finetune_textmonkey.sh
+++ b/finetune/finetune_textmonkey.sh
 #!/bin/bash
 export CUDA_DEVICE_MAX_CONNECTIONS=1
+
 DIR=`pwd`

-GPUS_PER_NODE=8
+CUDA_VISIBLE_DEVICES=2,3
+
+GPUS_PER_NODE=1
 NNODES=1
 NODE_RANK=0
 MASTER_ADDR=localhost
-MASTER_PORT=6001
-
-MODEL="Qwen/Qwen-VL" # We use the first version of Qwen-VL
+MASTER_PORT=29502
+MODEL="/home/wanglch/projects/TextMonkey/TextMonkey_base" # We use the first version of Qwen-VL
 # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
 # See the section for finetuning in README for more information.
-DATA="pathto/data"
+DATA="/home/wanglch/projects/TextMonkey/Monkey/data/data.json"

 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
@@ -20,12 +22,11 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
    --master_port $MASTER_PORT"


-torchrun $DISTRIBUTED_ARGS finetune_multitask_dialouge_doc.py\
+torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py\
    --model_name_or_path $MODEL \
    --data_path $DATA \
-    --bf16 True \
    --fix_vit True \
-    --output_dir output_model \
+    --output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_dcu \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
@@ -44,7 +45,7 @@ torchrun $DISTRIBUTED_ARGS finetune_multitask_dialouge_doc.py\
    --model_max_length 2048 \
    --gradient_checkpointing \
    --lazy_preprocess True \
-    --deepspeed finetune/ds_config_zero2.json \
+    --deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
    --image_size 896 \
    --image_width 896 \
    --image_height 896 \

--- a/finetune/finetune_textmonkey_gpu.sh
+++ b/finetune/finetune_textmonkey_gpu.sh
+#!/bin/bash
+# Fine-tune TextMonkey on a single node by launching
+# finetune_multitask_dialouge_doc.py through torchrun.
+#
+# NOTE: the model, data, training-script, DeepSpeed-config and output paths
+# below are absolute paths under /home/wanglch; change them to match your
+# machine before running.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Not referenced anywhere below; kept for compatibility with the original script.
+DIR=`pwd`
+
+# Must be exported: a plain assignment stays local to this shell, so the
+# torchrun child processes would never see it and the device selection would
+# be silently ignored.
+export CUDA_VISIBLE_DEVICES=3,5,6,7
+
+# Number of worker processes started on this node; keep it equal to the number
+# of devices listed in CUDA_VISIBLE_DEVICES above.
+GPUS_PER_NODE=4
+NNODES=1
+NODE_RANK=0
+MASTER_ADDR=localhost
+MASTER_PORT=29517
+
+# Path of the base checkpoint passed to --model_name_or_path.
+MODEL="/home/wanglch/projects/TextMonkey/TextMonkey_base"
+# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
+# See the section for finetuning in README for more information.
+DATA="/home/wanglch/projects/TextMonkey/Monkey/data/data.json"
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT"
+
+# $DISTRIBUTED_ARGS is intentionally unquoted so that it word-splits into
+# separate torchrun arguments.
+torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py \
+    --model_name_or_path "$MODEL" \
+    --data_path "$DATA" \
+    --fp16 True \
+    --fix_vit True \
+    --output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_gpu \
+    --num_train_epochs 2 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1000 \
+    --save_total_limit 10 \
+    --learning_rate 1e-5 \
+    --weight_decay 0.1 \
+    --adam_beta2 0.95 \
+    --warmup_ratio 0.02 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "none" \
+    --model_max_length 2048 \
+    --gradient_checkpointing \
+    --lazy_preprocess True \
+    --deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
+    --image_size 896 \
+    --image_width 896 \
+    --image_height 896 \
+    --add_window true \
+    --use_global true \
+    --resampler true \
+    --use_lora True \
+    --remain 512
+
+
+
+
--- a/finetune_multitask_dialouge_doc.py
+++ b/finetune_multitask_dialouge_doc.py
@@ -24,7 +24,7 @@ IGNORE_TOKEN_ID = LabelSmoother.ignore_index

 @dataclass
 class ModelArguments:
-    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")
+    model_name_or_path: Optional[str] = field(default="/home/wanglch/projects/TextMonkey/TextMonkey_base")


 @dataclass
@@ -330,7 +330,7 @@ def train():

    # Set RoPE scaling factor
    config = MonkeyConfig.from_pretrained(
-        "monkey_model",
+        "/home/wanglch/projects/TextMonkey/TextMonkey_base",
        cache_dir=training_args.cache_dir,
        trust_remote_code=True,
    )
@@ -362,7 +362,7 @@ def train():
    )

    tokenizer = QWenTokenizer.from_pretrained(
-        "monkey_model",
+        "/home/wanglch/projects/TextMonkey/TextMonkey_base",
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
@@ -402,21 +402,23 @@ def train():
            model.lm_head.requires_grad_(False)

    if training_args.use_lora:
-        if lora_args.q_lora or "chat" in model_args.model_name_or_path.lower():
-            modules_to_save = None
-        else:
-            modules_to_save = []
-        lora_config = LoraConfig(
-            r=lora_args.lora_r,
-            lora_alpha=lora_args.lora_alpha,
-            target_modules=lora_args.lora_target_modules,
-            lora_dropout=lora_args.lora_dropout,
-            bias=lora_args.lora_bias,
-            task_type="CAUSAL_LM",
-            modules_to_save=modules_to_save  # This argument serves for adding new tokens.
-        )
+        model.transformer.requires_grad_(False)
+        model.lm_head.requires_grad_(False)
+        model.transformer.visual.requires_grad_(False)
+
+        if hasattr(model.transformer.visual, 'attn_pool'):
+            model.transformer.visual.attn_pool.requires_grad_(True)
+        # only keep the gradient of lora and resampler module
+        for k, v in model.named_parameters():
+            if "lora" in k:
+                v.requires_grad_(True)
+        for k, v in model.named_parameters():
+            if "window_attention" in k:
+                v.requires_grad_(True)

-        model = get_peft_model(model, lora_config)
+        if training_args.fix_llm and hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
+            model.transformer.h.requires_grad_(False)
+            model.transformer.wte.requires_grad_(False)

        if training_args.gradient_checkpointing:
            model.enable_input_require_grads()

--- a/finetune_textmonkey_dcu.sh
+++ b/finetune_textmonkey_dcu.sh
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+DIR=`pwd`
+
+CUDA_VISIBLE_DEVICES=2,3
+
+
+GPUS_PER_NODE=2
+NNODES=1
+NODE_RANK=0
+MASTER_ADDR=localhost
+MASTER_PORT=29519
+
+# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
+# See the section for finetuning in README for more information.
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT"
+
+torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py \
+    --model_name_or_path /home/wanglch/projects/TextMonkey/TextMonkey_base \
+    --data_path /home/wanglch/projects/TextMonkey/Monkey/data/data.json \
+    --fp16 True \
+    --fix_vit True \
+    --fix_llm True \
+    --output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_dcu \
+    --num_train_epochs 2 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1000 \
+    --save_total_limit 10 \
+    --learning_rate 1e-5 \
+    --weight_decay 0.1 \
+    --adam_beta2 0.95 \
+    --warmup_ratio 0.02 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "none" \
+    --model_max_length 2048 \
+    --gradient_checkpointing \
+    --lazy_preprocess True \
+    --deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
+    --image_size 896 \
+    --image_width 896 \
+    --image_height 896 \
+    --add_window true \
+    --use_global true \
+    --resampler true  \
+    --use_lora True \
+    --remain 512
+
+
+
+
--- a/flagged/log.csv
+++ b/flagged/log.csv
+Question,Input Image,flag,username,timestamp
+Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/b445973ef3af610d7fd782de98c9024c87fa6089/tmpzy2ualgz.jpg,,,2024-06-25 08:04:24.668143
+OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/a1dd742177e84e78950310040d01161b92346156/tmpvn3mfx6n.jpg,,,2024-06-25 08:09:46.739416
+OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/9bdfec8fa848fbd1b77f92775f7fa599ad5b8bb6/tmpyiolfgnr.jpg,,,2024-06-25 08:15:40.416458
+OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/b58a30506cef8141a652187c8226fd11ef31c2b2/tmp0xwrftr0.png,,,2024-06-25 08:19:13.998547
+Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/e1893e80604de8d93761c9c086931017491e6206/tmpyoeac1nf.jpg,,,2024-06-25 09:36:28.584080
+ocr这张图片的文字信息，并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/69feaa2646e24ab2e5f57b9fe172462aa9ed5fca/tmpyggw5lgf.jpg,,,2024-06-25 09:42:04.567537
+这是什么,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/456dce7b71ced3cc7333304165fb1d64f76a20bf/tmph__9e7j9.jpg,,,2024-06-25 09:45:15.589882
+ocr这张图片的文字信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/3a31676bd1d1d8674d4ddfec28ce9fbb88817864/tmpk_5ah8qz.jpg,,,2024-06-25 09:49:44.657171
+Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/00b0f410c05a56bea4ef0f10faac96f02c510e85/tmpdgp55_39.jpg,,,2024-06-25 09:53:14.966818
+Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/fc2458835e72faaaba997e88edf0ce68033d3f81/tmp7m9oees7.jpg,,,2024-06-25 09:53:16.438519
+OCR with grounding:,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/088a928701d360040ccb3973ef23255894f686ca/tmpnwp_sbet.jpg,,,2024-06-25 09:56:51.982646
+Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/12662d0af4fe2d7db3a9d18bfb4196260efe36db/tmpo17kbd8t.jpg,,,2024-06-25 10:02:04.521209
+OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/c1f53d5707fd1db35a0583cee3c86a419db02a20/tmpl8qyqanw.jpg,,,2024-06-25 10:02:52.364145
+Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/f40f171969a76d98e17b86ca3745df2e0db81f58/tmpzn5c5psa.jpg,,,2024-06-25 10:03:28.890421
+ocr这张图片中的文字信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ae81de303bdc16ea3756317e5d41889a414679e1/tmpq1ofrhsz.jpg,,,2024-06-25 10:06:39.108823
+ Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/34e919ffd1a492e7545d772efcbada1725595604/tmpsu3amqf2.jpg,,,2024-06-25 10:09:11.675886
+ Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/5dde3e1670ee1dd00d565a8ec98cc2e952b0c149/tmp72is7k_3.jpg,,,2024-06-25 10:12:07.051628
+Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ed60ab97135937aaf18ce3e64355e74115d427ee/tmp_y9evvhz.jpg,,,2024-06-25 10:14:38.290775
+ocr收款人信息，出票金额，实际结算金额，申请人，出票行信息，并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ce8dd2355cfe0b828cbcb29c9736d91d96f068e2/tmp6d4d59vr.jpg,,,2024-06-26 08:45:54.525564
+读取收款人信息，出票金额，实际结算金额，申请人，出票行信息，并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/9180b70e5022e2382c3203f619b44bce77e2d8a6/tmpsrt32sg9.jpg,,,2024-06-26 08:47:00.371397
+收款人信息，出票金额，实际结算金额，申请人，出票行信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/1f5481e4a02e0cb417fdf97a4d452e12166922a3/tmp4dmuc013.jpg,,,2024-06-26 08:47:25.486834
+申请人是谁,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/4a04c4b87418b2cedb358296193cc28f170809ab/tmpi7ky4d4u.jpg,,,2024-06-26 08:51:40.970529
+出票行是？,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/48620f449363a74816cca64204acfaed17f2896f/tmpvye_ndzz.jpg,,,2024-06-26 08:52:30.040664
+Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/82cfba4aa08b13088c780cb23c995debf0716685/tmp1xxxovg8.jpg,,,2024-06-26 08:53:07.991437
+出票日期是什么时候,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/dbf2f3ea4e5202857c9aad2b993d575f552a048f/tmphahxx1_w.jpg,,,2024-06-26 09:02:41.378048
--- a/model.properties
+++ b/model.properties
+# Unique model identifier
+modelCode = 742
+# Model name
+modelName=text-monkey_pytorch
+# Model description
+modelDescription=多模态OCR大模型
+# Application scenarios
+appScenario=推理,训练,对话问答,金融,教育,政府,交通
+# Framework type
+frameType=pytorch
--- a/monkey_model/config.json
+++ b/monkey_model/config.json
-{
-    "architectures": [
-      "MonkeyLMHeadModel"
-    ],
-    "attn_dropout_prob": 0.0,
-    "auto_map": {
-      "AutoConfig": "configuration_qwen.QWenConfig",
-      "AutoModelForCausalLM": "modeling_monkey.MonkeyLMHeadModel"
-    },
-    "bf16": true,
-    "emb_dropout_prob": 0.0,
-    "fp16": false,
-    "fp32": false,
-    "hidden_size": 4096,
-    "initializer_range": 0.02,
-    "intermediate_size": 22016,
-    "kv_channels": 128,
-    "layer_norm_epsilon": 1e-06,
-    "max_position_embeddings": 8192,
-    "model_type": "monkey",
-    "no_bias": true,
-    "num_attention_heads": 32,
-    "num_hidden_layers": 32,
-    "onnx_safe": null,
-    "rotary_emb_base": 10000,
-    "rotary_pct": 1.0,
-    "scale_attn_weights": true,
-    "seq_length": 2048,
-    "tie_word_embeddings": false,
-    "tokenizer_type": "QWenTokenizer",
-    "torch_dtype": "bfloat16",
-    "transformers_version": "4.32.0",
-    "use_cache": false,
-    "use_dynamic_ntk": true,
-    "use_flash_attn": false,
-    "use_logn_attn": true,
-    "visual": {
-      "heads": 16,
-      "image_size": 896,
-      "image_start_id": 151857,
-      "layers": 48,
-      "mlp_ratio": 4.9231,
-      "output_dim": 4096,
-      "patch_size": 14,
-      "width": 1664,
-      "lora_repeat_num":4
-    },
-    "vocab_size": 151936
-  }
-  
\ No newline at end of file
--- a/monkey_model/configuration_monkey.py
+++ b/monkey_model/configuration_monkey.py
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from transformers import PretrainedConfig
-
-
-class MonkeyConfig(PretrainedConfig):
-    """Hyper-parameter container for the Monkey model.
-
-    Every constructor argument except ``tie_word_embeddings`` and the extra
-    keyword arguments is stored, unchanged, as an instance attribute of the
-    same name.  ``tie_word_embeddings`` and ``**kwargs`` are handed to
-    ``PretrainedConfig.__init__``.
-    """
-
-    model_type = "monkey"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=151936,
-        hidden_size=4096,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        emb_dropout_prob=0.0,
-        attn_dropout_prob=0.0,
-        layer_norm_epsilon=1e-6,
-        initializer_range=0.02,
-        max_position_embeddings=8192,
-        scale_attn_weights=True,
-        use_cache=True,
-        bf16=False,
-        fp16=False,
-        fp32=False,
-        kv_channels=128,
-        rotary_pct=1.0,
-        rotary_emb_base=10000,
-        use_dynamic_ntk=True,
-        use_logn_attn=True,
-        use_flash_attn="auto",
-        intermediate_size=22016,
-        no_bias=True,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        # The mapping's insertion order is the order in which the attributes
-        # are assigned, so the instance ``__dict__`` layout stays stable.
-        own_fields = {
-            "vocab_size": vocab_size,
-            "hidden_size": hidden_size,
-            "intermediate_size": intermediate_size,
-            "num_hidden_layers": num_hidden_layers,
-            "num_attention_heads": num_attention_heads,
-            "emb_dropout_prob": emb_dropout_prob,
-            "attn_dropout_prob": attn_dropout_prob,
-            "layer_norm_epsilon": layer_norm_epsilon,
-            "initializer_range": initializer_range,
-            "scale_attn_weights": scale_attn_weights,
-            "use_cache": use_cache,
-            "max_position_embeddings": max_position_embeddings,
-            "bf16": bf16,
-            "fp16": fp16,
-            "fp32": fp32,
-            "kv_channels": kv_channels,
-            "rotary_pct": rotary_pct,
-            "rotary_emb_base": rotary_emb_base,
-            "use_dynamic_ntk": use_dynamic_ntk,
-            "use_logn_attn": use_logn_attn,
-            "use_flash_attn": use_flash_attn,
-            "no_bias": no_bias,
-        }
-        for field_name, field_value in own_fields.items():
-            setattr(self, field_name, field_value)
-
-        # The base initializer runs last, once all model-specific fields exist.
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/monkey_model/configuration_qwen.py
+++ b/monkey_model/configuration_qwen.py
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-from transformers import PretrainedConfig
-
-
-class QWenConfig(PretrainedConfig):
-    """Configuration object holding the QWen model hyper-parameters.
-
-    The constructor copies each of its named arguments (other than
-    ``tie_word_embeddings``) onto the instance under the same name, then
-    forwards ``tie_word_embeddings`` together with any extra keyword
-    arguments to ``PretrainedConfig.__init__``.
-    """
-
-    # NOTE(review): this is the same ``model_type`` string that MonkeyConfig
-    # declares -- confirm the duplication is intentional.
-    model_type = "monkey"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=151936,
-        hidden_size=4096,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        emb_dropout_prob=0.0,
-        attn_dropout_prob=0.0,
-        layer_norm_epsilon=1e-6,
-        initializer_range=0.02,
-        max_position_embeddings=8192,
-        scale_attn_weights=True,
-        use_cache=True,
-        bf16=False,
-        fp16=False,
-        fp32=False,
-        kv_channels=128,
-        rotary_pct=1.0,
-        rotary_emb_base=10000,
-        use_dynamic_ntk=True,
-        use_logn_attn=True,
-        use_flash_attn="auto",
-        intermediate_size=22016,
-        no_bias=True,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        # (attribute name, value) pairs, listed in assignment order.
-        attribute_pairs = (
-            ("vocab_size", vocab_size),
-            ("hidden_size", hidden_size),
-            ("intermediate_size", intermediate_size),
-            ("num_hidden_layers", num_hidden_layers),
-            ("num_attention_heads", num_attention_heads),
-            ("emb_dropout_prob", emb_dropout_prob),
-            ("attn_dropout_prob", attn_dropout_prob),
-            ("layer_norm_epsilon", layer_norm_epsilon),
-            ("initializer_range", initializer_range),
-            ("scale_attn_weights", scale_attn_weights),
-            ("use_cache", use_cache),
-            ("max_position_embeddings", max_position_embeddings),
-            ("bf16", bf16),
-            ("fp16", fp16),
-            ("fp32", fp32),
-            ("kv_channels", kv_channels),
-            ("rotary_pct", rotary_pct),
-            ("rotary_emb_base", rotary_emb_base),
-            ("use_dynamic_ntk", use_dynamic_ntk),
-            ("use_logn_attn", use_logn_attn),
-            ("use_flash_attn", use_flash_attn),
-            ("no_bias", no_bias),
-        )
-        for attr_name, attr_value in attribute_pairs:
-            setattr(self, attr_name, attr_value)
-
-        # Parent initialisation happens after the fields above are in place.
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/monkey_model/modeling_monkey.py
+++ b/monkey_model/modeling_monkey.py
-import importlib
-import math
-from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.cuda.amp import autocast
-
-from torch.nn import CrossEntropyLoss
-from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
-from transformers.generation.logits_process import LogitsProcessorList
-
-if TYPE_CHECKING:
-    from transformers.generation.streamers import BaseStreamer
-from transformers.generation.utils import GenerateOutput
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-try:
-    from einops import rearrange
-except ImportError:
-    rearrange = None
-from torch import nn
-from monkey_model.modeling_qwen import QWenModel,QWenPreTrainedModel,QWenLMHeadModel
-# Hardware capability flags, evaluated once when this module is imported.
-SUPPORT_CUDA = torch.cuda.is_available()
-if SUPPORT_CUDA:
-    # Only query the device when CUDA is actually usable.
-    SUPPORT_BF16 = torch.cuda.is_bf16_supported()
-    # fp16 is treated as supported when the first element of device 0's
-    # capability tuple is at least 7.
-    SUPPORT_FP16 = torch.cuda.get_device_capability(0)[0] >= 7
-else:
-    SUPPORT_BF16 = SUPPORT_CUDA
-    SUPPORT_FP16 = SUPPORT_CUDA
-# Module-level logger used for the precision warnings below.
-logger = logging.get_logger(__name__)
-class MonkeyModel(QWenModel):
-    """QWenModel variant that encodes images referenced inside ``input_ids``.
-
-    On a pass without cached key/values, image paths embedded in the token
-    stream are decoded, run through ``self.visual`` (per-window and on the
-    448-resized image), and the concatenated features are handed to
-    ``QWenModel.forward`` as its last positional argument.
-    """
-
-    def __init__(self, config):
-        super().__init__(config)
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        """Encode any in-prompt images, then delegate to ``QWenModel.forward``.
-
-        All arguments are forwarded positionally, in declaration order, to the
-        parent ``forward``; the encoded image features (or ``None``) are
-        appended as the final positional argument.
-
-        Returns:
-            Whatever ``QWenModel.forward`` returns.
-
-        Raises:
-            AssertionError: if image start/end markers do not pair up row by
-                row, or if the number of encoded images differs from the
-                number of image paths found in ``input_ids``.
-        """
-        images = None
-        # Images are only looked for when there is no cache.  ``input_ids`` may
-        # legitimately be None (caller supplies ``inputs_embeds``); comparing
-        # None against the marker id would make ``torch.any`` raise.
-        if past_key_values is None and input_ids is not None:
-            image_start_id = self.config.visual['image_start_id']
-            if torch.any(input_ids == image_start_id):
-                # The end marker is looked up as ``image_start_id + 1``.
-                bos_pos = torch.where(input_ids == image_start_id)
-                eos_pos = torch.where(input_ids == image_start_id + 1)
-                # Every start marker must share its batch row with an end marker.
-                assert (bos_pos[0] == eos_pos[0]).all()
-                img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
-
-                image_paths = []
-                for i, a, b in img_pos:
-                    # Token ids between the markers are raw UTF-8 byte values of
-                    # the image path, cut at the first ``image_start_id + 2``
-                    # token (presumably padding -- confirm against the tokenizer).
-                    image = input_ids[i][a + 1 : b - 1].tolist()
-                    image = image[: image.index(image_start_id + 2)]
-                    image_paths.append(bytes(image).decode('utf-8'))
-
-                windows, images_448 = self.visual.encode(image_paths)
-
-                # Each window patch is encoded with its own running index,
-                # counted across all columns in iteration order.
-                flat_patches = (patch for col in windows for patch in col)
-                patch_list = [
-                    self.visual(image_patch, idx=lora_idx)
-                    for lora_idx, image_patch in enumerate(flat_patches)
-                ]
-
-                global_feat = self.visual(images_448)
-                local_feat = torch.cat(patch_list, dim=1)
-                images = torch.cat([local_feat, global_feat], dim=1)
-                # Compare against the number of decoded paths; checking the
-                # tensor against its own ``len`` would always pass.
-                assert images.shape[0] == len(image_paths)
-
-        return super().forward(
-            input_ids,
-            past_key_values,
-            attention_mask,
-            token_type_ids,
-            position_ids,
-            head_mask,
-            inputs_embeds,
-            encoder_hidden_states,
-            encoder_attention_mask,
-            use_cache,
-            output_attentions,
-            output_hidden_states,
-            return_dict,
-            images,
-        )
-    
-
-
-
-class MonkeyLMHeadModel(QWenLMHeadModel):
-    """Causal-LM wrapper that pairs a ``MonkeyModel`` backbone with a linear LM head.
-
-    The constructor resolves the numeric precision from ``config.bf16`` /
-    ``config.fp16`` / ``config.fp32`` (auto-selecting one when none is set),
-    builds the backbone and head, and casts them to the chosen precision.
-    """
-
-    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
-    _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
-
-    def __init__(self, config):
-        """Build the model from ``config``.
-
-        ``config`` is mutated in place when no precision flag is set: exactly
-        one of ``bf16`` / ``fp16`` / ``fp32`` is switched on based on the
-        module-level ``SUPPORT_BF16`` / ``SUPPORT_FP16`` flags.
-
-        Raises:
-            AssertionError: if more than one precision flag is true.
-        """
-        # NOTE(review): QWenLMHeadModel.__init__ is not visible here; if it
-        # already builds ``transformer`` / ``lm_head`` they are replaced below.
-        super().__init__(config)
-
-        # Number of precision flags that are switched on (bools sum as ints).
-        enabled_precisions = config.bf16 + config.fp16 + config.fp32
-        assert (
-            enabled_precisions <= 1
-        ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
-
-        autoset_precision = enabled_precisions == 0
-
-        if autoset_precision:
-            # Preference order: bf16, then fp16, then fp32.
-            if SUPPORT_BF16:
-                logger.warning(
-                    "The model is automatically converting to bf16 for faster inference. "
-                    "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
-                )
-                config.bf16 = True
-            elif SUPPORT_FP16:
-                logger.warning(
-                    "The model is automatically converting to fp16 for faster inference. "
-                    "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
-                )
-                config.fp16 = True
-            else:
-                config.fp32 = True
-
-        # Advisory messages only: a mismatched choice is reported, not rejected.
-        if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
-            logger.warning("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
-        if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
-            logger.warning("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
-        if config.fp32:
-            if SUPPORT_BF16:
-                logger.warning("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
-            elif SUPPORT_FP16:
-                logger.warning("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
-
-        self.transformer = MonkeyModel(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Cast backbone and head together so their dtypes stay in sync.
-        if config.bf16:
-            self.transformer.bfloat16()
-            self.lm_head.bfloat16()
-        if config.fp16:
-            self.transformer.half()
-            self.lm_head.half()
-        self.post_init()
-
-
--- a/monkey_model/modeling_qwen.py
+++ b/monkey_model/modeling_qwen.py