"docs/source/en/api/pipelines/unclip.md" did not exist on "830a9d1f0189d541328691c7cb20ceed829b7645"
Commit b81b2f59 authored by wanglch's avatar wanglch
Browse files

Initial commit

parent f7c86e68
../../.github/CONTRIBUTING.md
\ No newline at end of file
from argparse import ArgumentParser
from pathlib import Path
import copy
import gradio as gr
import os
import re
import secrets
import tempfile
from PIL import Image
from monkey_model.modeling_monkey import MonkeyLMHeadModel
from monkey_model.tokenization_qwen import QWenTokenizer
from monkey_model.configuration_monkey import MonkeyConfig
import shutil
from pathlib import Path
import json
DEFAULT_CKPT_PATH = '/home/zhangli/demo/'
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
title_markdown = ("""
# Welcome to Monkey
Hello! I'm Monkey, a Large Language and Vision Assistant. Before talking to me, please read the **Operation Guide** and **Terms of Use**.
你好!我是Monkey,一个大型语言和视觉助理。在与我交谈之前,请阅读**操作指南**和**使用条款**。
## Operation Guide 操作指南
Click the **Upload** button to upload an image. Then, you can get Monkey's answer in two ways:点击**Upload**上传图像。你可以通过两种方式得到Monkey的回答:
- Click the **Generate** and Monkey will generate a description of the image. 点击**Generate**,Monkey将生成图像的描述。
- Enter the question in the dialog box, click the **Submit**, and Monkey will answer the question based on the image. 在对话框中输入问题,点击**Submit**,Monkey会根据图片回答问题。
- Click **Clear History** to clear the current image and Q&A content.点击**Clear History**,清除当前图片和问答内容。
> Note: Monkey does not have a multi-round dialogue function. Perhaps we will further develop its capabilities in the future. 注意:Monkey没有多轮对话功能,或许我们在未来会进一步开发它的能力。
> Monkey支持中文,但使用英文提问会比使用中文效果明显好.""")
policy_markdown = ("""
## Terms of Use
By using this service, users are required to agree to the following terms:
- Monkey is for research use only and unauthorized commercial use is prohibited. For any query, please contact the author.
- Monkey's generation capabilities are limited, so we recommend that users do not rely entirely on its answers.
- Monkey's security measures are limited, so we cannot guarantee that the output is completely appropriate. We strongly recommend that users do not intentionally guide Monkey to generate harmful content, including hate speech, discrimination, violence, pornography, deception, etc.
""")
def _get_args():
parser = ArgumentParser()
parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH,
help="Checkpoint name or path, default to %(default)r")
parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only")
parser.add_argument("--share", action="store_true", default=False,
help="Create a publicly shareable link for the interface.")
parser.add_argument("--inbrowser", action="store_true", default=False,
help="Automatically launch the interface in a new tab on the default browser.")
parser.add_argument("--server-port", type=int, default=8000,
help="Demo server port.")
parser.add_argument("--server-name", type=str, default="127.0.0.1",
help="Demo server name.")
args = parser.parse_args()
return args
def _load_model_tokenizer(args):
tokenizer = QWenTokenizer.from_pretrained(
args.checkpoint_path, trust_remote_code=True)
if args.cpu_only:
device_map = "cpu"
else:
device_map = "cuda"
model = MonkeyLMHeadModel.from_pretrained(
args.checkpoint_path,
device_map=device_map,
trust_remote_code=True,
).eval()
# model.generation_config = GenerationConfig.from_pretrained(
# args.checkpoint_path, trust_remote_code=True, resume_download=True,
# )
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
return model, tokenizer
def _parse_text(text):
lines = text.split("\n")
lines = [line for line in lines if line != ""]
count = 0
for i, line in enumerate(lines):
if "```" in line:
count += 1
items = line.split("`")
if count % 2 == 1:
lines[i] = f'<pre><code class="language-{items[-1]}">'
else:
lines[i] = f"<br></code></pre>"
else:
if i > 0:
if count % 2 == 1:
line = line.replace("`", r"\`")
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace(" ", "&nbsp;")
line = line.replace("*", "&ast;")
line = line.replace("_", "&lowbar;")
line = line.replace("-", "&#45;")
line = line.replace(".", "&#46;")
line = line.replace("!", "&#33;")
line = line.replace("(", "&#40;")
line = line.replace(")", "&#41;")
line = line.replace("$", "&#36;")
lines[i] = "<br>" + line
text = "".join(lines)
return text
def _launch_demo(args, model, tokenizer):
def predict(_chatbot, task_history):
chat_query = _chatbot[-1][0]
query = task_history[-1][0]
question = _parse_text(query)
print("User: " + _parse_text(query))
full_response = ""
img_path = _chatbot[0][0][0]
try:
Image.open(img_path)
except:
response = "Please upload a picture."
_chatbot[-1] = (_parse_text(chat_query), response)
full_response = _parse_text(response)
task_history[-1] = (query, full_response)
print("Monkey: " + _parse_text(full_response))
return _chatbot
query = f'<img>{img_path}</img> {question} Answer: '
print(query)
input_ids = tokenizer(query, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
_chatbot[-1] = (_parse_text(chat_query), response)
full_response = _parse_text(response)
task_history[-1] = (query, full_response)
print("Monkey: " + _parse_text(full_response))
return _chatbot
def caption(_chatbot, task_history):
query = "Generate the detailed caption in English:"
chat_query = "Generate the detailed caption in English:"
question = _parse_text(query)
print("User: " + _parse_text(query))
full_response = ""
try:
img_path = _chatbot[0][0][0]
Image.open(img_path)
except:
response = "Please upload a picture."
_chatbot.append((None, response))
full_response = _parse_text(response)
task_history.append((None, full_response))
print("Monkey: " + _parse_text(full_response))
return _chatbot
img_path = _chatbot[0][0][0]
query = f'<img>{img_path}</img> {chat_query} '
print(query)
input_ids = tokenizer(query, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=True,
temperature=0.7,
max_new_tokens=250,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
_chatbot.append((None, response))
full_response = _parse_text(response)
task_history.append((None, full_response))
print("Monkey: " + _parse_text(full_response))
return _chatbot
def add_text(history, task_history, text):
task_text = text
if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
task_text = text[:-1]
history = history + [(_parse_text(text), None)]
task_history = task_history + [(task_text, None)]
print(history, task_history, text)
return history, task_history, ""
def add_file(history, task_history, file):
history = [((file.name,), None)]
task_history = [((file.name,), None)]
print(history, task_history, file)
return history, task_history
def reset_user_input():
return gr.update(value="")
def reset_state(task_history):
task_history.clear()
return []
with gr.Blocks() as demo:
gr.Markdown(title_markdown)
chatbot = gr.Chatbot(label='Monkey', elem_classes="control-height", height=600,avatar_images=("https://ooo.0x0.ooo/2023/11/09/OehsLx.png","https://ooo.0x0.ooo/2023/11/09/OehGBC.png"),layout="bubble",bubble_full_width=False,show_copy_button=True)
query = gr.Textbox(lines=1, label='Input')
task_history = gr.State([])
with gr.Row():
empty_bin = gr.Button("Clear History (清空)")
submit_btn = gr.Button("Submit (提问)")
generate_btn_en = gr.Button("Generate")
addfile_btn = gr.UploadButton("Upload (上传图片)", file_types=["image"])
submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
predict, [chatbot, task_history], [chatbot], show_progress=True
)
generate_btn_en.click(caption, [chatbot, task_history], [chatbot], show_progress=True)
submit_btn.click(reset_user_input, [], [query])
empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True,scroll_to_output=True)
gr.Markdown(policy_markdown)
demo.queue().launch(
server_name="0.0.0.0",
server_port=7681
)
def main():
args = _get_args()
model, tokenizer = _load_model_tokenizer(args)
_launch_demo(args, model, tokenizer)
if __name__ == '__main__':
main()
...@@ -5,6 +5,7 @@ from monkey_model.modeling_textmonkey import TextMonkeyLMHeadModel ...@@ -5,6 +5,7 @@ from monkey_model.modeling_textmonkey import TextMonkeyLMHeadModel
from monkey_model.tokenization_qwen import QWenTokenizer from monkey_model.tokenization_qwen import QWenTokenizer
from monkey_model.configuration_monkey import MonkeyConfig from monkey_model.configuration_monkey import MonkeyConfig
from argparse import ArgumentParser from argparse import ArgumentParser
import torch
def _get_args(): def _get_args():
parser = ArgumentParser() parser = ArgumentParser()
...@@ -21,7 +22,7 @@ def _get_args(): ...@@ -21,7 +22,7 @@ def _get_args():
return args return args
args = _get_args() args = _get_args()
checkpoint_path = args.checkpoint_path checkpoint_path = args.checkpoint_path
device_map = "cuda" device_map = "auto"
# Create model # Create model
config = MonkeyConfig.from_pretrained( config = MonkeyConfig.from_pretrained(
checkpoint_path, checkpoint_path,
...@@ -73,7 +74,7 @@ def inference(input_str, input_image): ...@@ -73,7 +74,7 @@ def inference(input_str, input_image):
pred = model.generate( pred = model.generate(
input_ids=input_ids.cuda(), input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(), attention_mask=attention_mask.cuda(),
do_sample=False, do_sample=True,
num_beams=1, num_beams=1,
max_new_tokens=2048, max_new_tokens=2048,
min_new_tokens=1, min_new_tokens=1,
...@@ -86,7 +87,7 @@ def inference(input_str, input_image): ...@@ -86,7 +87,7 @@ def inference(input_str, input_image):
) )
response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=False).strip() response = tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=False).strip()
image = Image.open(input_image).convert("RGB").resize((1000,1000)) image = Image.open(input_image).convert("RGB").resize((1000,1000))
font = ImageFont.truetype('NimbusRoman-Regular.otf', 22) font = ImageFont.load_default() # 使用系统默认字体
bboxes = re.findall(r'<box>(.*?)</box>', response, re.DOTALL) bboxes = re.findall(r'<box>(.*?)</box>', response, re.DOTALL)
refs = re.findall(r'<ref>(.*?)</ref>', response, re.DOTALL) refs = re.findall(r'<ref>(.*?)</ref>', response, re.DOTALL)
if len(refs)!=0: if len(refs)!=0:
......
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
ENV DEBIAN_FRONTEND=noninteractive
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
transformers==4.32.0
accelerate
tiktoken
einops
einops_exts
transformers_stream_generator==0.0.4
scipy
pillow
tensorboard
matplotlib
deepspeed
gradio
peft
\ No newline at end of file
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
#!/bin/bash #!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1 export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd` DIR=`pwd`
GPUS_PER_NODE=8 CUDA_VISIBLE_DEVICES=2,3
GPUS_PER_NODE=1
NNODES=1 NNODES=1
NODE_RANK=0 NODE_RANK=0
MASTER_ADDR=localhost MASTER_ADDR=localhost
MASTER_PORT=6001 MASTER_PORT=29502
MODEL="/home/wanglch/projects/TextMonkey/TextMonkey_base" # We use the first version of Qwen-VL
MODEL="Qwen/Qwen-VL" # We use the first version of Qwen-VL
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information. # See the section for finetuning in README for more information.
DATA="pathto/data" DATA="/home/wanglch/projects/TextMonkey/Monkey/data/data.json"
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \ --nnodes $NNODES \
...@@ -20,12 +22,11 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ ...@@ -20,12 +22,11 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
--master_port $MASTER_PORT" --master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS finetune_multitask_dialouge_doc.py\ torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py\
--model_name_or_path $MODEL \ --model_name_or_path $MODEL \
--data_path $DATA \ --data_path $DATA \
--bf16 True \
--fix_vit True \ --fix_vit True \
--output_dir output_model \ --output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_dcu \
--num_train_epochs 1 \ --num_train_epochs 1 \
--per_device_train_batch_size 2 \ --per_device_train_batch_size 2 \
--per_device_eval_batch_size 1 \ --per_device_eval_batch_size 1 \
...@@ -44,7 +45,7 @@ torchrun $DISTRIBUTED_ARGS finetune_multitask_dialouge_doc.py\ ...@@ -44,7 +45,7 @@ torchrun $DISTRIBUTED_ARGS finetune_multitask_dialouge_doc.py\
--model_max_length 2048 \ --model_max_length 2048 \
--gradient_checkpointing \ --gradient_checkpointing \
--lazy_preprocess True \ --lazy_preprocess True \
--deepspeed finetune/ds_config_zero2.json \ --deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
--image_size 896 \ --image_size 896 \
--image_width 896 \ --image_width 896 \
--image_height 896 \ --image_height 896 \
......
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
CUDA_VISIBLE_DEVICES=3,5,6,7
GPUS_PER_NODE=4
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29517
MODEL="/home/wanglch/projects/TextMonkey/TextMonkey_base" # We use the first version of Qwen-VL
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="/home/wanglch/projects/TextMonkey/Monkey/data/data.json"
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py\
--model_name_or_path $MODEL \
--data_path $DATA \
--fp16 True \
--fix_vit True \
--output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_gpu \
--num_train_epochs 2 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.02 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--gradient_checkpointing \
--lazy_preprocess True \
--deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
--image_size 896 \
--image_width 896 \
--image_height 896 \
--add_window true \
--use_global true \
--resampler true \
--use_lora True \
--remain 512
...@@ -24,7 +24,7 @@ IGNORE_TOKEN_ID = LabelSmoother.ignore_index ...@@ -24,7 +24,7 @@ IGNORE_TOKEN_ID = LabelSmoother.ignore_index
@dataclass @dataclass
class ModelArguments: class ModelArguments:
model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B") model_name_or_path: Optional[str] = field(default="/home/wanglch/projects/TextMonkey/TextMonkey_base")
@dataclass @dataclass
...@@ -330,7 +330,7 @@ def train(): ...@@ -330,7 +330,7 @@ def train():
# Set RoPE scaling factor # Set RoPE scaling factor
config = MonkeyConfig.from_pretrained( config = MonkeyConfig.from_pretrained(
"monkey_model", "/home/wanglch/projects/TextMonkey/TextMonkey_base",
cache_dir=training_args.cache_dir, cache_dir=training_args.cache_dir,
trust_remote_code=True, trust_remote_code=True,
) )
...@@ -362,7 +362,7 @@ def train(): ...@@ -362,7 +362,7 @@ def train():
) )
tokenizer = QWenTokenizer.from_pretrained( tokenizer = QWenTokenizer.from_pretrained(
"monkey_model", "/home/wanglch/projects/TextMonkey/TextMonkey_base",
cache_dir=training_args.cache_dir, cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length, model_max_length=training_args.model_max_length,
padding_side="right", padding_side="right",
...@@ -402,21 +402,23 @@ def train(): ...@@ -402,21 +402,23 @@ def train():
model.lm_head.requires_grad_(False) model.lm_head.requires_grad_(False)
if training_args.use_lora: if training_args.use_lora:
if lora_args.q_lora or "chat" in model_args.model_name_or_path.lower(): model.transformer.requires_grad_(False)
modules_to_save = None model.lm_head.requires_grad_(False)
else: model.transformer.visual.requires_grad_(False)
modules_to_save = []
lora_config = LoraConfig( if hasattr(model.transformer.visual, 'attn_pool'):
r=lora_args.lora_r, model.transformer.visual.attn_pool.requires_grad_(True)
lora_alpha=lora_args.lora_alpha, # only keep the gradient of lora and resampler module
target_modules=lora_args.lora_target_modules, for k, v in model.named_parameters():
lora_dropout=lora_args.lora_dropout, if "lora" in k:
bias=lora_args.lora_bias, v.requires_grad_(True)
task_type="CAUSAL_LM", for k, v in model.named_parameters():
modules_to_save=modules_to_save # This argument serves for adding new tokens. if "window_attention" in k:
) v.requires_grad_(True)
model = get_peft_model(model, lora_config) if training_args.fix_llm and hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
model.transformer.h.requires_grad_(False)
model.transformer.wte.requires_grad_(False)
if training_args.gradient_checkpointing: if training_args.gradient_checkpointing:
model.enable_input_require_grads() model.enable_input_require_grads()
......
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
CUDA_VISIBLE_DEVICES=2,3
GPUS_PER_NODE=2
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=29519
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS /home/wanglch/projects/TextMonkey/Monkey/finetune_multitask_dialouge_doc.py \
--model_name_or_path /home/wanglch/projects/TextMonkey/TextMonkey_base \
--data_path /home/wanglch/projects/TextMonkey/Monkey/data/data.json \
--fp16 True \
--fix_vit True \
--fix_llm True \
--output_dir /home/wanglch/projects/saves/TextMonkey/Train_multi_dcu \
--num_train_epochs 2 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.02 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--gradient_checkpointing \
--lazy_preprocess True \
--deepspeed /home/wanglch/projects/TextMonkey/Monkey/finetune/ds_config_zero2.json \
--image_size 896 \
--image_width 896 \
--image_height 896 \
--add_window true \
--use_global true \
--resampler true \
--use_lora True \
--remain 512
Question,Input Image,flag,username,timestamp
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/b445973ef3af610d7fd782de98c9024c87fa6089/tmpzy2ualgz.jpg,,,2024-06-25 08:04:24.668143
OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/a1dd742177e84e78950310040d01161b92346156/tmpvn3mfx6n.jpg,,,2024-06-25 08:09:46.739416
OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/9bdfec8fa848fbd1b77f92775f7fa599ad5b8bb6/tmpyiolfgnr.jpg,,,2024-06-25 08:15:40.416458
OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/b58a30506cef8141a652187c8226fd11ef31c2b2/tmp0xwrftr0.png,,,2024-06-25 08:19:13.998547
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/e1893e80604de8d93761c9c086931017491e6206/tmpyoeac1nf.jpg,,,2024-06-25 09:36:28.584080
ocr这张图片的文字信息,并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/69feaa2646e24ab2e5f57b9fe172462aa9ed5fca/tmpyggw5lgf.jpg,,,2024-06-25 09:42:04.567537
这是什么,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/456dce7b71ced3cc7333304165fb1d64f76a20bf/tmph__9e7j9.jpg,,,2024-06-25 09:45:15.589882
ocr这张图片的文字信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/3a31676bd1d1d8674d4ddfec28ce9fbb88817864/tmpk_5ah8qz.jpg,,,2024-06-25 09:49:44.657171
Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/00b0f410c05a56bea4ef0f10faac96f02c510e85/tmpdgp55_39.jpg,,,2024-06-25 09:53:14.966818
Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/fc2458835e72faaaba997e88edf0ce68033d3f81/tmp7m9oees7.jpg,,,2024-06-25 09:53:16.438519
OCR with grounding:,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/088a928701d360040ccb3973ef23255894f686ca/tmpnwp_sbet.jpg,,,2024-06-25 09:56:51.982646
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/12662d0af4fe2d7db3a9d18bfb4196260efe36db/tmpo17kbd8t.jpg,,,2024-06-25 10:02:04.521209
OCR with grounding,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/c1f53d5707fd1db35a0583cee3c86a419db02a20/tmpl8qyqanw.jpg,,,2024-06-25 10:02:52.364145
Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/f40f171969a76d98e17b86ca3745df2e0db81f58/tmpzn5c5psa.jpg,,,2024-06-25 10:03:28.890421
ocr这张图片中的文字信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ae81de303bdc16ea3756317e5d41889a414679e1/tmpq1ofrhsz.jpg,,,2024-06-25 10:06:39.108823
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/34e919ffd1a492e7545d772efcbada1725595604/tmpsu3amqf2.jpg,,,2024-06-25 10:09:11.675886
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/5dde3e1670ee1dd00d565a8ec98cc2e952b0c149/tmp72is7k_3.jpg,,,2024-06-25 10:12:07.051628
Read all the text in the image.,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ed60ab97135937aaf18ce3e64355e74115d427ee/tmp_y9evvhz.jpg,,,2024-06-25 10:14:38.290775
ocr收款人信息,出票金额,实际结算金额,申请人,出票行信息,并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/ce8dd2355cfe0b828cbcb29c9736d91d96f068e2/tmp6d4d59vr.jpg,,,2024-06-26 08:45:54.525564
读取收款人信息,出票金额,实际结算金额,申请人,出票行信息,并以json格式返回,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/9180b70e5022e2382c3203f619b44bce77e2d8a6/tmpsrt32sg9.jpg,,,2024-06-26 08:47:00.371397
收款人信息,出票金额,实际结算金额,申请人,出票行信息,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/1f5481e4a02e0cb417fdf97a4d452e12166922a3/tmp4dmuc013.jpg,,,2024-06-26 08:47:25.486834
申请人是谁,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/4a04c4b87418b2cedb358296193cc28f170809ab/tmpi7ky4d4u.jpg,,,2024-06-26 08:51:40.970529
出票行是?,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/48620f449363a74816cca64204acfaed17f2896f/tmpvye_ndzz.jpg,,,2024-06-26 08:52:30.040664
Read all the text in the image,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/82cfba4aa08b13088c780cb23c995debf0716685/tmp1xxxovg8.jpg,,,2024-06-26 08:53:07.991437
出票日期是什么时候,/home/wanglch/projects/TextMonkey/Monkey/flagged/Input Image/dbf2f3ea4e5202857c9aad2b993d575f552a048f/tmphahxx1_w.jpg,,,2024-06-26 09:02:41.378048
# 模型唯一标识
modelCode = 742
# 模型名称
modelName=text-monkey_pytorch
# 模型描述
modelDescription=多模态OCR大模型
# 应用场景
appScenario=推理,训练,对话问答,金融,教育,政府,交通
# 框架类型
frameType=pytorch
{
"architectures": [
"MonkeyLMHeadModel"
],
"attn_dropout_prob": 0.0,
"auto_map": {
"AutoConfig": "configuration_qwen.QWenConfig",
"AutoModelForCausalLM": "modeling_monkey.MonkeyLMHeadModel"
},
"bf16": true,
"emb_dropout_prob": 0.0,
"fp16": false,
"fp32": false,
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 22016,
"kv_channels": 128,
"layer_norm_epsilon": 1e-06,
"max_position_embeddings": 8192,
"model_type": "monkey",
"no_bias": true,
"num_attention_heads": 32,
"num_hidden_layers": 32,
"onnx_safe": null,
"rotary_emb_base": 10000,
"rotary_pct": 1.0,
"scale_attn_weights": true,
"seq_length": 2048,
"tie_word_embeddings": false,
"tokenizer_type": "QWenTokenizer",
"torch_dtype": "bfloat16",
"transformers_version": "4.32.0",
"use_cache": false,
"use_dynamic_ntk": true,
"use_flash_attn": false,
"use_logn_attn": true,
"visual": {
"heads": 16,
"image_size": 896,
"image_start_id": 151857,
"layers": 48,
"mlp_ratio": 4.9231,
"output_dim": 4096,
"patch_size": 14,
"width": 1664,
"lora_repeat_num":4
},
"vocab_size": 151936
}
\ No newline at end of file
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from transformers import PretrainedConfig
class MonkeyConfig(PretrainedConfig):
model_type = "monkey"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=151936,
hidden_size=4096,
num_hidden_layers=32,
num_attention_heads=32,
emb_dropout_prob=0.0,
attn_dropout_prob=0.0,
layer_norm_epsilon=1e-6,
initializer_range=0.02,
max_position_embeddings=8192,
scale_attn_weights=True,
use_cache=True,
bf16=False,
fp16=False,
fp32=False,
kv_channels=128,
rotary_pct=1.0,
rotary_emb_base=10000,
use_dynamic_ntk=True,
use_logn_attn=True,
use_flash_attn="auto",
intermediate_size=22016,
no_bias=True,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.emb_dropout_prob = emb_dropout_prob
self.attn_dropout_prob = attn_dropout_prob
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.max_position_embeddings = max_position_embeddings
self.bf16 = bf16
self.fp16 = fp16
self.fp32 = fp32
self.kv_channels = kv_channels
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.use_dynamic_ntk = use_dynamic_ntk
self.use_logn_attn = use_logn_attn
self.use_flash_attn = use_flash_attn
self.no_bias = no_bias
super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs
)
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from transformers import PretrainedConfig
class QWenConfig(PretrainedConfig):
model_type = "monkey"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=151936,
hidden_size=4096,
num_hidden_layers=32,
num_attention_heads=32,
emb_dropout_prob=0.0,
attn_dropout_prob=0.0,
layer_norm_epsilon=1e-6,
initializer_range=0.02,
max_position_embeddings=8192,
scale_attn_weights=True,
use_cache=True,
bf16=False,
fp16=False,
fp32=False,
kv_channels=128,
rotary_pct=1.0,
rotary_emb_base=10000,
use_dynamic_ntk=True,
use_logn_attn=True,
use_flash_attn="auto",
intermediate_size=22016,
no_bias=True,
tie_word_embeddings=False,
**kwargs,
):
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.emb_dropout_prob = emb_dropout_prob
self.attn_dropout_prob = attn_dropout_prob
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
self.scale_attn_weights = scale_attn_weights
self.use_cache = use_cache
self.max_position_embeddings = max_position_embeddings
self.bf16 = bf16
self.fp16 = fp16
self.fp32 = fp32
self.kv_channels = kv_channels
self.rotary_pct = rotary_pct
self.rotary_emb_base = rotary_emb_base
self.use_dynamic_ntk = use_dynamic_ntk
self.use_logn_attn = use_logn_attn
self.use_flash_attn = use_flash_attn
self.no_bias = no_bias
super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs
)
import importlib
import math
from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.cuda.amp import autocast
from torch.nn import CrossEntropyLoss
from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
from transformers.generation.logits_process import LogitsProcessorList
if TYPE_CHECKING:
from transformers.generation.streamers import BaseStreamer
from transformers.generation.utils import GenerateOutput
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
try:
from einops import rearrange
except ImportError:
rearrange = None
from torch import nn
from monkey_model.modeling_qwen import QWenModel,QWenPreTrainedModel,QWenLMHeadModel
SUPPORT_CUDA = torch.cuda.is_available()
SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
logger = logging.get_logger(__name__)
class MonkeyModel(QWenModel):
def __init__(self, config):
super().__init__(config)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']):
bos_pos = torch.where(input_ids == self.config.visual['image_start_id'])
eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1)
assert (bos_pos[0] == eos_pos[0]).all()
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
images = []
for i, a, b in img_pos:
image = input_ids[i][a + 1 : b - 1].tolist()
image = image[ : image.index(self.config.visual['image_start_id'] + 2)]
images.append(bytes(image).decode('utf-8'))
windows,images_448 = self.visual.encode(images)
patch_list = []
lora_idx = 0
for col in windows:
for image_patch in col:
patch_list.append(self.visual(image_patch,idx=lora_idx))
lora_idx += 1
global_feat = self.visual(images_448)
local_feat = torch.cat(patch_list,dim=1)
images = torch.cat([local_feat,global_feat],dim=1)
assert images.shape[0] == len(images)
else:
images = None
return super().forward(input_ids,
past_key_values,
attention_mask,
token_type_ids,
position_ids,
head_mask,inputs_embeds,
encoder_hidden_states,
encoder_attention_mask,
use_cache,
output_attentions,
output_hidden_states,
return_dict,
images)
class MonkeyLMHeadModel(QWenLMHeadModel):
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"]
_keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"]
def __init__(self, config):
super().__init__(config)
assert (
config.bf16 + config.fp16 + config.fp32 <= 1
), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
if autoset_precision:
if SUPPORT_BF16:
logger.warn(
"The model is automatically converting to bf16 for faster inference. "
"If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
)
config.bf16 = True
elif SUPPORT_FP16:
logger.warn(
"The model is automatically converting to fp16 for faster inference. "
"If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
)
config.fp16 = True
else:
config.fp32 = True
if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
if config.fp32:
if SUPPORT_BF16:
logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
elif SUPPORT_FP16:
logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
self.transformer = MonkeyModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
if config.bf16:
self.transformer.bfloat16()
self.lm_head.bfloat16()
if config.fp16:
self.transformer.half()
self.lm_head.half()
self.post_init()
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment