Commit 58d33d4c authored by wanglch

Initial commit
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.bmp filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text
*.mp3 filter=lfs diff=lfs merge=lfs -text
*.wav filter=lfs diff=lfs merge=lfs -text
*.wma filter=lfs diff=lfs merge=lfs -text
*.aac filter=lfs diff=lfs merge=lfs -text
*.ogg filter=lfs diff=lfs merge=lfs -text
*.m4a filter=lfs diff=lfs merge=lfs -text
*.m3u8 filter=lfs diff=lfs merge=lfs -text
*.amr filter=lfs diff=lfs merge=lfs -text
*.audio filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text
*.flv filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.mpg filter=lfs diff=lfs merge=lfs -text
*.asf filter=lfs diff=lfs merge=lfs -text
*.mov filter=lfs diff=lfs merge=lfs -text
*.mpeg filter=lfs diff=lfs merge=lfs -text
*.3gp filter=lfs diff=lfs merge=lfs -text
*.wmv filter=lfs diff=lfs merge=lfs -text
*.rmvb filter=lfs diff=lfs merge=lfs -text
*.rm filter=lfs diff=lfs merge=lfs -text
*.ts filter=lfs diff=lfs merge=lfs -text
*.mkv filter=lfs diff=lfs merge=lfs -text
*.flash filter=lfs diff=lfs merge=lfs -text
*.vob filter=lfs diff=lfs merge=lfs -text
*.pdf filter=lfs diff=lfs merge=lfs -text
*.ost filter=lfs diff=lfs merge=lfs -text
*.pst filter=lfs diff=lfs merge=lfs -text
*.doc filter=lfs diff=lfs merge=lfs -text
*.docx filter=lfs diff=lfs merge=lfs -text
*.txt filter=lfs diff=lfs merge=lfs -text
*.ppt filter=lfs diff=lfs merge=lfs -text
*.pptx filter=lfs diff=lfs merge=lfs -text
*.xls filter=lfs diff=lfs merge=lfs -text
*.xlsx filter=lfs diff=lfs merge=lfs -text
*.vsd filter=lfs diff=lfs merge=lfs -text
*.vsdx filter=lfs diff=lfs merge=lfs -text
---
license: Apache License 2.0
---
For the dataset file metadata and the data files themselves, please browse the "Dataset Files" page.
This dataset card uses the default template; the dataset contributor has not provided a more detailed introduction, but you can download the dataset with the Git clone command below or via the ModelScope SDK.
#### Download
:modelscope-code[]{type="sdk"}
:modelscope-code[]{type="git"}
{"default": {"features": {"image": {"_type": "Value"}, "messages": {"_type": "Value"}, "task_name": {"_type": "Value"}, "dataset_name": {"_type": "Value"}, "id": {"_type": "Value"}}, "splits": {"train": {"name": "train", "dataset_name": "DocLocal4K"}}}}
{"image": ["./imgs/VisualMRC/images/sarawut.news/news__thai-immigration-chief-announces-arrest-of-wanted-british-child-sex-offender02.png"], "messages": [{"role": "user", "content": "<|image|>Detect the text in the bounding box <bbox>203,690,817,726</bbox>"}, {"role": "assistant", "content": "<ocr> despite being the subject of a court order had in </ocr>"}], "task_name": "phrase_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_374"}
{"image": ["./imgs/VisualMRC/images/pro.europeana.eu/person__aleksandra-strzelichowska04.png"], "messages": [{"role": "user", "content": "<|image|>Recognize the text in the bounding box <bbox>560,494,913,649</bbox>"}, {"role": "assistant", "content": "<ocr> Network Cultural Heritage Institutions Berlin \n How to get young people interested in </ocr>"}], "task_name": "block_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_650"}
{"image": ["./imgs/VisualMRC/images/pro.europeana.eu/person__max-kaiser03.png"], "messages": [{"role": "user", "content": "<|image|>Detect the text in the bounding box <bbox>551,851,603,882</bbox>"}, {"role": "assistant", "content": "<ocr> into </ocr>"}], "task_name": "word_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_679"}
{"image": ["./imgs/VisualMRC/images/hdbuzz.net/__about__funding01.png"], "messages": [{"role": "user", "content": "<|image|>Identify the text within the bounding box <bbox>368,723,684,780</bbox>"}, {"role": "assistant", "content": "<ocr> special access to its content, </ocr>"}], "task_name": "phrase_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_299"}
{"image": ["./imgs/VisualMRC/images/en.wikiversity.org/wiki__Nuclease_mediated_genome_engineering_tools02.png"], "messages": [{"role": "user", "content": "<|image|>Detect the text in the bounding box <bbox>24,452,978,638</bbox>"}, {"role": "assistant", "content": "<ocr> Main characteristics of ZF domain originate from a zinc ion coordinated by two cystein residue, located on two antiparallel B-sheets, and two histidine \n residues on the a-helix. In nature, the ZF domain is primarily responsible for connection with DNA, each finger is capable of detecting a 3 nucleotide unit, \n called target triplet, by means of its a- helix, thus named also recognition helix. Linkage to DNA only occurs at single strand. A 3 finger ZF can detect 9 </ocr>"}], "task_name": "block_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_251"}
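The JSONL records above pair an image path with a bbox-grounded OCR instruction. A minimal stdlib sketch (not part of the repo) of parsing one such record:

```python
import json
import re

# Parse one of the DocLocal4K JSONL records shown above; the record is
# copied verbatim from the samples and field names follow the dataset schema.
record = json.loads(
    '{"image": ["./imgs/VisualMRC/images/pro.europeana.eu/person__max-kaiser03.png"], '
    '"messages": [{"role": "user", "content": "<|image|>Detect the text in the bounding box '
    '<bbox>551,851,603,882</bbox>"}, {"role": "assistant", "content": "<ocr> into </ocr>"}], '
    '"task_name": "word_bbox2t_sft", "dataset_name": "VisualMRC", "id": "VisualMRC_bbox2t_679"}'
)

# Extract the bounding box coordinates from the user turn ...
bbox = [int(v) for v in re.search(r"<bbox>(\d+),(\d+),(\d+),(\d+)</bbox>",
                                  record["messages"][0]["content"]).groups()]
# ... and the target text from the assistant turn.
text = re.search(r"<ocr>(.*?)</ocr>", record["messages"][1]["content"]).group(1).strip()

print(bbox)  # [551, 851, 603, 882]
print(text)  # into
```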
# mPLUG-DocOwl 1.5
mPLUG-DocOwl 1.5 is the latest open-source work from Alibaba's mPLUG team on multimodal document image understanding. It achieves state-of-the-art results on 10 document understanding benchmarks, improves by more than 10 points on 5 of them, surpasses Zhipu's 17.3B CogAgent on several datasets, and reaches 82.2 on DocVQA.
## Paper
- [mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895)
## Model Architecture
DocOwl 1.5 emphasizes the importance of understanding "document structure" in document image understanding and proposes unified structure learning for all text-rich images. Following the team's earlier works DocOwl and UReader, it handles high-resolution document images with a shape-adaptive cropping module that splits a high-resolution image into multiple sub-images of equal size. To better convey the text layout of the image to the LLM while keeping the visual sequence short for high-resolution inputs, DocOwl 1.5 introduces H-Reducer, a convolution-based connector that merges 4 horizontally adjacent visual features. The model architecture is shown below.
<div align="center">
<img src="./assets/model_strcuture.png">
</div>
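To make the 4:1 horizontal merging concrete, here is an illustrative sketch in plain Python. It only demonstrates the shape change; the real H-Reducer uses a learned convolution, which this averaging stand-in does not reproduce.

```python
# Illustrative sketch of H-Reducer's horizontal 4:1 reduction (assumption:
# averaging stands in for the actual learned convolution).
def h_reduce(features, ratio=4):
    """features: H x W grid of feature vectors (lists of floats)."""
    reduced = []
    for row in features:
        assert len(row) % ratio == 0, "width must be divisible by the ratio"
        new_row = []
        for i in range(0, len(row), ratio):
            group = row[i:i + ratio]
            # element-wise mean over `ratio` horizontally adjacent features
            new_row.append([sum(v) / ratio for v in zip(*group)])
        reduced.append(new_row)
    return reduced

# A 2 x 8 grid of 1-d features becomes 2 x 2: the visual sequence passed to
# the LLM shrinks 4x horizontally while rows (text lines) stay separate.
grid = [[[float(c)] for c in range(8)] for _ in range(2)]
out = h_reduce(grid)
print(len(out), len(out[0]))  # 2 2
```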
## Algorithm
To enable unified structure learning, this work builds DocStruct4M, a comprehensive structure-aware parsing dataset constructed from open-source data. For document images and webpage screenshots, text layout is mainly represented with spaces and line breaks. For tables, an extended Markdown syntax expresses cells spanning multiple rows or columns while using far fewer tags than HTML. For charts, Markdown is likewise used to represent the underlying data, with numeric values limited to significant digits that remain visually discernible in the image. For natural images, a caption plus OCR text is used.
<div align=center>
<img src="./assets/model_theory.png">
</div>
## Environment Setup
### Docker (Option 1)
Docker image pull address and usage steps from [SourceFind](https://www.sourcefind.cn/#/service-details):
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name mplug-doclocal <your imageID> bash
cd /path/your_code_data/
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile (Option 2)
```
cd /path/your_code_data/docker
docker build --no-cache -t mplug-doclocal:latest .
docker run --shm-size=64G --name mplug-doclocal -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v /path/your_code_data/:/path/your_code_data/ -it mplug-doclocal bash
```
### Anaconda (Option 3)
The special deep-learning libraries required by this project for DCU GPUs can be downloaded from the [HPC developer community](https://developer.hpccube.com/tool/).
```
DTK driver: dtk24.04
python: 3.10
torch: 2.1
torchvision: 0.16.0
deepspeed: 0.12.3
```
`Tip: the versions of the DTK driver, Python, torch and other DCU-related tools above must match each other exactly.`
```
conda create -n mplug-doclocal python=3.10
conda activate mplug-doclocal
cd /path/your_code_data/
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple
```
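After installation, the version pins listed above can be checked with a small stdlib sketch (assumption: the pip distribution names match the names in the table):

```python
# Verify the pinned versions from the table above after installation.
from importlib.metadata import version, PackageNotFoundError

pins = {"torch": "2.1", "torchvision": "0.16.0", "deepspeed": "0.12.3"}
for pkg, want in pins.items():
    try:
        got = version(pkg)
        status = "OK" if got.startswith(want) else f"MISMATCH (got {got})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{pkg}: {status}")
```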
## Dataset
Mini dataset: [mini DocLocal4K](./DocLocal4K/mini_imges.jsonl)
Download path for the full [DocLocal4K](https://www.modelscope.cn/datasets/iic/DocLocal4K) dataset
For pre-training, prepare your training data by collecting all samples into a list and saving it as a JSON file. Each sample is a dictionary with the fields shown in the example below; prepare the full dataset for regular training in the same layout:
```
{"image": ["./imgs/DUE_Benchmark/DocVQA/pngs/xnbl0037_1.png"], "messages": [{"role": "user", "content": "<|image|>what is the date mentioned in this letter?"}, {"role": "assistant", "content": "1/8/93"}], "task_name": "qa_sft", "dataset_name": "DocVQA"}
```
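Samples in this format can be produced programmatically. A small sketch (the image path and question are hypothetical placeholders, not files from the dataset) that builds and sanity-checks one sample:

```python
import json

# Build one training sample in the format shown above. The image path and
# question are hypothetical placeholders.
sample = {
    "image": ["./imgs/DUE_Benchmark/DocVQA/pngs/example.png"],
    "messages": [
        {"role": "user", "content": "<|image|>what is the date mentioned in this letter?"},
        {"role": "assistant", "content": "1/8/93"},
    ],
    "task_name": "qa_sft",
    "dataset_name": "DocVQA",
}

# Basic sanity checks: the user turn must carry the <|image|> token and
# roles must alternate user -> assistant.
assert sample["messages"][0]["content"].startswith("<|image|>")
assert [m["role"] for m in sample["messages"]] == ["user", "assistant"]

# Training expects all samples collected into a single JSON list.
dataset = [sample]
serialized = json.dumps(dataset, ensure_ascii=False)
print(json.loads(serialized)[0]["dataset_name"])  # DocVQA
```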
## Training
Fast dataset download center: [SCNet AIDatasets](http://113.200.138.88:18080/aidatasets)
The test dataset used in this project can be downloaded from the fast download channel: [VisualMRC](http://113.200.138.88:18080/aidatasets/visualmrc)
Adjust the following paths in the script to match your setup:
--deepspeed
--model_name_or_path
--data_path
--image_folder
--output_dir
### Single Node, Multiple GPUs
```
sh finetune_docowl_lora_dcu.sh
```
## Inference
### Single Node, Single GPU
mPLUG-DocOwl 1.5 performs best on English documents; Chinese support will be optimized in a later release.
If the following error appears, click Regenerate and try again:
`NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.`
### Instruction QA
```
python docowl_infer.py
```
## Results
### Instruction QA
<div align=center>
<img src="./assets/result.jpg"/>
</div>
### Accuracy
Test data: [mini DocLocal4K](./DocLocal4K/mini_imges.jsonl); accelerator cards used: K100/A800.
| device | train_loss |
| :------: | :------: |
| K100*2 | 10.3234 |
| A800*2 | 10.3188 |
## Application Scenarios
### Algorithm Category
`OCR, question answering`
### Key Application Industries
`Finance, education, government, transportation`
## Pre-trained Weights
- [iic/DocOwl1.5-Omni](https://www.modelscope.cn/models/iic/DocOwl1.5-Omni/)
Fast model download center: [SCNet AIModels](http://113.200.138.88:18080/aimodels)
The pre-trained weights for this project can be downloaded from the fast download channel: [DocOwl1.5-Omni](http://113.200.138.88:18080/aimodels/mplug-doclcal_1.5)
## Source Repository and Issue Reporting
- http://developer.hpccube.com/codes/modelzoo/umt5.git
## References
- [iic/DocOwl1.5-Omni on ModelScope](https://www.modelscope.cn/models/iic/DocOwl1.5-Omni/)
- [mPLUG-DocOwl 1.5 on GitHub](https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl1.5)
<div align="center">
<img src="assets/mPLUG_new1.png" width="80%">
</div>
<div align="center">
<h2>The Powerful Multi-modal LLM Family
for OCR-free Document Understanding</h2>
<strong>Alibaba Group</strong>
</div>
<p align="center">
<a href="https://trendshift.io/repositories/9061" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9061" alt="DocOwl | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
## 📢 News
* 🔥🔥🔥 [2024.5.08] We have released the training code of [DocOwl1.5](./DocOwl1.5/) supported by DeepSpeed. You can now finetune a stronger model based on DocOwl1.5!
* 🔥🔥🔥 [2024.4.26] We release the arxiv paper of [TinyChart](https://arxiv.org/abs/2404.16635), a SOTA 3B Multimodal LLM for Chart Understanding with Program-of-Thought ability (ChartQA: 83.6 > Gemini-Ultra 80.8 > GPT4V 78.5). The demo of TinyChart is available on [HuggingFace](https://huggingface.co/spaces/mPLUG/TinyChart-3B) 🤗. Codes, models and data are released in [TinyChart](./TinyChart/).
* 🔥🔥🔥 [2024.4.3] We build demos of DocOwl1.5 on both [ModelScope](https://modelscope.cn/studios/iic/mPLUG-DocOwl/) <img src="./assets/modelscope.png" width='20'> and [HuggingFace](https://huggingface.co/spaces/mPLUG/DocOwl) 🤗, supported by the DocOwl1.5-Omni. The source codes of launching a local demo are also released in [DocOwl1.5](./DocOwl1.5/).
* 🔥🔥 [2024.3.28] We release the training data (DocStruct4M, DocDownstream-1.0, DocReason25K), codes and models (DocOwl1.5-stage1, DocOwl1.5, DocOwl1.5-Chat, DocOwl1.5-Omni) of [mPLUG-DocOwl 1.5](./DocOwl1.5/) on both **HuggingFace** 🤗 and **ModelScope** <img src="./assets/modelscope.png" width='20'>.
* 🔥 [2024.3.20] We release the arxiv paper of [mPLUG-DocOwl 1.5](http://arxiv.org/abs/2403.12895), a SOTA 8B Multimodal LLM on OCR-free Document Understanding (DocVQA 82.2, InfoVQA 50.7, ChartQA 70.2, TextVQA 68.6).
* [2024.01.13] Our Scientific Diagram Analysis dataset [M-Paper](https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/PaperOwl) has been available on both **HuggingFace** 🤗 and **ModelScope** <img src="./assets/modelscope.png" width='20'>, containing 447k high-resolution diagram images and corresponding paragraph analysis.
* [2023.10.13] Training data, models of [mPLUG-DocOwl](./DocOwl/)/[UReader](./UReader/) has been open-sourced.
* [2023.10.10] Our paper [UReader](https://arxiv.org/abs/2310.05126) is accepted by EMNLP 2023.
<!-- * 🔥 [10.10] The source code and instruction data will be released in [UReader](https://github.com/LukeForeverYoung/UReader). -->
* [2023.07.10] The demo of mPLUG-DocOwl on [ModelScope](https://modelscope.cn/studios/damo/mPLUG-DocOwl/summary) is available.
* [2023.07.07] We release the technical report and evaluation set of mPLUG-DocOwl.
## 🤖 Models
- [**mPLUG-DocOwl1.5**](./DocOwl1.5/) (Arxiv 2024) - mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding
- [**TinyChart**](./TinyChart/) (Arxiv 2024) - TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts Learning
- [**mPLUG-PaperOwl**](./PaperOwl/) (Arxiv 2023) - mPLUG-PaperOwl: Scientific Diagram Analysis with the Multimodal Large Language Model
- [**UReader**](./UReader/) (EMNLP 2023) - UReader: Universal OCR-free Visually-situated Language Understanding with Multimodal Large Language Model
- [**mPLUG-DocOwl**](./DocOwl/) (Arxiv 2023) - mPLUG-DocOwl: Modularized Multimodal Large Language Model for Document Understanding
## 📺 Online Demo
Note: The HuggingFace demo is less stable than the ModelScope one because GPUs in HuggingFace ZeroGPU Spaces are dynamically assigned.
### 📖 DocOwl 1.5
- 🤗 [HuggingFace Space](https://huggingface.co/spaces/mPLUG/DocOwl)
- <img src="assets/modelscope.png" width='20'> [ModelScope Space](https://modelscope.cn/studios/iic/mPLUG-DocOwl/)
### 📈 TinyChart-3B
- 🤗 [HuggingFace Space](https://huggingface.co/spaces/mPLUG/TinyChart-3B)
## 🌰 Cases
![images](assets/docowl1.5_chat_case.png)
## Related Projects
* [mPLUG](https://github.com/alibaba/AliceMind/tree/main/mPLUG).
* [mPLUG-2](https://github.com/alibaba/AliceMind).
* [mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl)
import argparse
import datetime
import json
import os
import time
import gradio as gr
import requests
from mplug_docowl.conversation import (default_conversation, conv_templates,
SeparatorStyle)
from mplug_docowl.constants import LOGDIR
from mplug_docowl.utils import (build_logger, server_error_msg,
violates_moderation, moderation_msg)
from model_worker import ModelWorker
import hashlib
from icecream import ic
logger = build_logger("gradio_web_server_local", "gradio_web_server_local.log")
headers = {"User-Agent": "mPLUG-DocOwl1.5 Client"}
# Button sentinels; on recent Gradio versions, return new gr.Button objects
# instead of the deprecated gr.Button.update(...)
no_change_btn = gr.Button()
enable_btn = gr.Button(interactive=True)
disable_btn = gr.Button(interactive=False)
def get_conv_log_filename():
t = datetime.datetime.now()
name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
return name
get_window_url_params = """
function() {
const params = new URLSearchParams(window.location.search);
url_params = Object.fromEntries(params);
console.log(url_params);
return url_params;
}
"""
def load_demo(url_params, request: gr.Request):
logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
state = default_conversation.copy()
return state
def vote_last_response(state, vote_type, request: gr.Request):
with open(get_conv_log_filename(), "a") as fout:
data = {
"tstamp": round(time.time(), 4),
"type": vote_type,
"state": state.dict(),
"ip": request.client.host,
}
fout.write(json.dumps(data) + "\n")
def upvote_last_response(state, request: gr.Request):
logger.info(f"upvote. ip: {request.client.host}")
vote_last_response(state, "upvote", request)
return ("",) + (disable_btn,) * 3
def downvote_last_response(state, request: gr.Request):
logger.info(f"downvote. ip: {request.client.host}")
vote_last_response(state, "downvote", request)
return ("",) + (disable_btn,) * 3
def flag_last_response(state, request: gr.Request):
logger.info(f"flag. ip: {request.client.host}")
vote_last_response(state, "flag", request)
return ("",) + (disable_btn,) * 3
def regenerate(state, image_process_mode, request: gr.Request):
logger.info(f"regenerate. ip: {request.client.host}")
state.messages[-1][-1] = None
prev_human_msg = state.messages[-2]
if type(prev_human_msg[1]) in (tuple, list):
prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
state.skip_next = False
return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
def clear_history(request: gr.Request):
logger.info(f"clear_history. ip: {request.client.host}")
state = default_conversation.copy()
return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
def add_text(state, text, image, image_process_mode, request: gr.Request):
logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
if len(text) <= 0 and image is None:
state.skip_next = True
return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 5
if args.moderate:
flagged = violates_moderation(text)
if flagged:
state.skip_next = True
return (state, state.to_gradio_chatbot(), moderation_msg, None) + (
no_change_btn,) * 5
text = text[:3584] # Hard cut-off
if image is not None:
text = text[:3500] # Hard cut-off for images
if '<|image|>' not in text:
text = '<|image|>' + text
text = (text, image, image_process_mode)
if len(state.get_images(return_pil=True)) > 0:
state = default_conversation.copy()
state.append_message(state.roles[0], text)
state.append_message(state.roles[1], None)
state.skip_next = False
return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
def http_bot(state, temperature, top_p, max_new_tokens, request: gr.Request):
logger.info(f"http_bot. ip: {request.client.host}")
start_tstamp = time.time()
if state.skip_next:
# This generate call is skipped due to invalid inputs
yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
return
if len(state.messages) == state.offset + 2:
# First round of conversation
template_name = "mplug_owl2"
new_state = conv_templates[template_name].copy()
new_state.append_message(new_state.roles[0], state.messages[-2][1])
new_state.append_message(new_state.roles[1], None)
state = new_state
# Construct prompt
prompt = state.get_prompt()
all_images = state.get_images(return_pil=True)
# debug
"""for image in all_images:
ic(image.size)"""
all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
for image, hash in zip(all_images, all_image_hash):
t = datetime.datetime.now()
filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
if not os.path.isfile(filename):
os.makedirs(os.path.dirname(filename), exist_ok=True)
image.save(filename)
# Make requests
pload = {
"prompt": prompt,
"temperature": float(temperature),
"top_p": float(top_p),
"max_new_tokens": min(int(max_new_tokens), 2048),
"stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
"images": f'List of {len(state.get_images())} images: {all_image_hash}',
}
logger.info(f"==== request ====\n{pload}")
pload['images'] = state.get_images()
state.messages[-1][-1] = "▌"
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
try:
# Stream output
# response = requests.post(worker_addr + "/worker_generate_stream",
# headers=headers, json=pload, stream=True, timeout=10)
# for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
response = model.generate_stream_gate(pload)
# print('response:', response)
for chunk in response:
if chunk:
print('chunk:', chunk.decode())
data = json.loads(chunk.decode())
if data["error_code"] == 0:
output = data["text"][len(prompt):].strip()
state.messages[-1][-1] = output + "▌"
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
else:
output = data["text"] + f" (error_code: {data['error_code']})"
state.messages[-1][-1] = output
yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
return
time.sleep(0.03)
except requests.exceptions.RequestException as e:
state.messages[-1][-1] = server_error_msg
yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
return
state.messages[-1][-1] = state.messages[-1][-1][:-1]
yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
finish_tstamp = time.time()
logger.info(f"{output}")
with open(get_conv_log_filename(), "a") as fout:
data = {
"tstamp": round(finish_tstamp, 4),
"type": "chat",
"start": round(start_tstamp, 4),
"finish": round(finish_tstamp, 4),
"state": state.dict(),
"images": all_image_hash,
"ip": request.client.host,
}
fout.write(json.dumps(data) + "\n")
title_markdown = ("""
<h1 align="center"><a href="https://github.com/X-PLUG/mPLUG-DocOwl"><img src="https://github.com/X-PLUG/mPLUG-DocOwl/raw/main/assets/mPLUG_new1.png", alt="mPLUG-DocOwl" border="0" style="margin: 0 auto; height: 200px;" /></a> </h1>
<h2 align="center"> mPLUG-DocOwl1.5: Unified Structure Learning for OCR-free Document Understanding</h2>
<h5 align="center"> If you like our project, please give us a star ✨ on GitHub for the latest updates. </h5>
<h5 align="center"> Note: This demo is temporarily only supported for English Document Understanding. The Chinese-and-English model is under development.</h5>
<h5 align="center"> 注意: 当前Demo只支持英文文档理解, 中英模型正在全力开发中。</h5>
<h5 align="center"> Note: If you want a detailed explanation, please remember to add the prompt "Give a detailed explanation." after the question.</h5>
<h5 align="center"> 注意: 如果你想要详细的推理解释, 请在问题后面加上“Give a detailed explanation.”。</h5>
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/X-PLUG/mPLUG-DocOwl'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://arxiv.org/abs/2403.12895"><img src="https://img.shields.io/badge/Arxiv-2403.12895-red"></a>
<a href='https://github.com/X-PLUG/mPLUG-DocOwl/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/mPLUG-DocOwl.svg?style=social'></a>
</div>
</div>
""")
tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")
learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
""")
block_css = """
#buttons button {
min-width: min(120px,100%);
}
.bot {
white-space: break-spaces;
}
"""
def build_demo(embed_mode):
textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
with gr.Blocks(title="mPLUG-DocOwl1.5", theme=gr.themes.Default(), css=block_css) as demo:
state = gr.State()
if not embed_mode:
gr.Markdown(title_markdown)
with gr.Row():
with gr.Column(scale=3):
imagebox = gr.Image(type="pil")
image_process_mode = gr.Radio(
# ["Crop", "Resize", "Pad", "Default"],
[],
value="Default",
label="Preprocess for non-square image", visible=False)
with gr.Accordion("Parameters", open=True) as parameter_row:
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.1, interactive=True, label="Temperature",)
top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
with gr.Column(scale=8):
chatbot = gr.Chatbot(elem_id="Chatbot", label="mPLUG-DocOwl1.5 Chatbot", height=600)
with gr.Row():
with gr.Column(scale=8):
textbox.render()
with gr.Column(scale=1, min_width=50):
submit_btn = gr.Button(value="Send", variant="primary")
with gr.Row(elem_id="buttons") as button_row:
upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
#stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
if not embed_mode:
gr.Markdown(tos_markdown)
gr.Markdown(learn_more_markdown)
url_params = gr.JSON(visible=False)
# Register listeners
btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
upvote_btn.click(
upvote_last_response,
state,
[textbox, upvote_btn, downvote_btn, flag_btn],
queue=False
)
downvote_btn.click(
downvote_last_response,
state,
[textbox, upvote_btn, downvote_btn, flag_btn],
queue=False
)
flag_btn.click(
flag_last_response,
state,
[textbox, upvote_btn, downvote_btn, flag_btn],
queue=False
)
regenerate_btn.click(
regenerate,
[state, image_process_mode],
[state, chatbot, textbox, imagebox] + btn_list,
queue=False
).then(
http_bot,
[state, temperature, top_p, max_output_tokens],
[state, chatbot] + btn_list
)
clear_btn.click(
clear_history,
None,
[state, chatbot, textbox, imagebox] + btn_list,
queue=False
)
textbox.submit(
add_text,
[state, textbox, imagebox, image_process_mode],
[state, chatbot, textbox, imagebox] + btn_list,
queue=False
).then(
http_bot,
[state, temperature, top_p, max_output_tokens],
[state, chatbot] + btn_list
)
submit_btn.click(
add_text,
[state, textbox, imagebox, image_process_mode],
[state, chatbot, textbox, imagebox] + btn_list,
queue=False
).then(
http_bot,
[state, temperature, top_p, max_output_tokens],
[state, chatbot] + btn_list
)
demo.load(
load_demo,
[url_params],
state,
_js=get_window_url_params,
queue=False
)
return demo
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int, default=7879)
parser.add_argument("--concurrency-count", type=int, default=10)
parser.add_argument("--model-list-mode", type=str, default="once",
choices=["once", "reload"])
parser.add_argument("--model-source", type=str, default="modelscope",
choices=["local", "modelscope", "huggingface"])
parser.add_argument("--model-version", type=str, default="Omni",
choices=['stage1', 'Chat','Omni'])
parser.add_argument("--model-path", type=str, default="iic/DocOwl1___5-Omni")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--load-8bit", action="store_true")
parser.add_argument("--load-4bit", action="store_true")
parser.add_argument("--moderate", action="store_true")
parser.add_argument("--embed", action="store_true")
args = parser.parse_args()
logger.info(f"args: {args}")
if args.model_source == 'modelscope':
# download model from modelscope
from modelscope.hub.snapshot_download import snapshot_download
model_dir = snapshot_download('iic/DocOwl1.5-'+args.model_version, cache_dir='./')
args.model_path = 'iic/DocOwl1___5-'+args.model_version
elif args.model_source == 'huggingface':
# download model from huggingface
from huggingface_hub import snapshot_download
model_dir = snapshot_download('mPLUG/DocOwl1.5-'+args.model_version, cache_dir='./')
args.model_path = 'mPLUG/DocOwl1.5-'+args.model_version
print(os.listdir('./'))
model = ModelWorker(args.model_path, None, None,
resolution=448,
anchors='grid_9',
add_global_img=True,
load_8bit=args.load_8bit,
load_4bit=args.load_4bit,
device=args.device)
logger.info(args)
demo = build_demo(args.embed)
demo.queue(
concurrency_count=args.concurrency_count,
api_open=False
).launch(
server_name=args.host,
server_port=args.port,
share=False
)
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="80" height="20" role="img" aria-label="Paper: Arxiv"><title>Paper: Arxiv</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="80" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="39" height="20" fill="#fe7d37"/><rect width="80" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Paper</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Paper</text><text aria-hidden="true" x="595" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="290">Arxiv</text><text x="595" y="140" transform="scale(.1)" fill="#fff" textLength="290">Arxiv</text></g></svg>
2024-07-01 02:59:33 | ERROR | stderr | /usr/local/lib/python3.10/dist-packages/gradio/components/button.py:89: UserWarning: Using the update method is deprecated. Simply return a new object instead, e.g. `return gr.Button(...)` instead of `return gr.Button.update(...)`.
2024-07-01 02:59:33 | ERROR | stderr | warnings.warn(
2024-07-01 02:59:33 | INFO | stdout | ['DocOwl1.5-Omni-base', 'assets', 'evaluation', 'scripts', 'README.md', 'model_worker.py', 'docowl_infer.py', 'requirements.txt', 'docowl_app_demo.sh', '__pycache__', 'mplug_docowl', 'app.py', 'docowl_doclocal4k_evaluate.py', 'docowl_benchmark_evaluate.py', 'demo_logs', 'image', 'DocLocal4K']
2024-07-01 02:59:33 | INFO | model_worker | Loading the model DocOwl1.5-Omni-base on worker 026d32 ...
2024-07-01 03:01:54 | ERROR | stderr | /usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
2024-07-01 03:01:54 | ERROR | stderr | warnings.warn(
2024-07-01 03:01:54 | ERROR | stderr | /usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
2024-07-01 03:01:54 | ERROR | stderr | warnings.warn(
2024-07-01 03:02:01 | INFO | stdout | Running on local URL: http://0.0.0.0:7860
2024-07-01 03:02:02 | INFO | stdout |
2024-07-01 03:02:02 | INFO | stdout | To create a public link, set `share=True` in `launch()`.
2024-07-01 03:03:12 | ERROR | stderr | /usr/local/lib/python3.10/dist-packages/fastapi/routing.py:191: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'mode': 'static'}
2024-07-01 03:03:12 | ERROR | stderr | return await dependant.call(**values)
2024-07-01 03:03:13 | ERROR | stderr | /usr/lib/python3.10/asyncio/runners.py:44: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'mode': 'static'}
2024-07-01 03:03:13 | ERROR | stderr | return loop.run_until_complete(main)
2024-07-01 03:03:14 | ERROR | stderr | ic| max_context_length: 4096
2024-07-01 03:03:14 | ERROR | stderr |  input_ids.shape[-1]: 97
2024-07-01 03:03:14 | ERROR | stderr |  num_image_tokens: 1799.0
2024-07-01 03:03:14 | ERROR | stderr |  max_new_tokens: 512
2024-07-01 03:03:14 | ERROR | stderr | /usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
2024-07-01 03:03:14 | ERROR | stderr | warnings.warn(
2024-07-01 03:03:29 | INFO | stdout | Caught Unknown Error
2024-07-01 03:03:29 | INFO | stdout | chunk: {"text": "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**", "error_code": 1}
2024-07-01 03:03:29 | ERROR | stderr | /usr/lib/python3.10/asyncio/runners.py:44: GradioUnusedKwargWarning: You have unused kwarg parameters in Button, please remove them: {'mode': 'dynamic'}
2024-07-01 03:03:29 | ERROR | stderr | return loop.run_until_complete(main)
2024-07-01 03:03:43 | INFO | stdout | Keyboard interruption in main thread... closing server.
2024-07-01 03:03:43 | INFO | stdout | 
2024-07-01 00:36:30 | ERROR | stderr | Traceback (most recent call last):
2024-07-01 00:36:30 | ERROR | stderr | File "/home/wanglch/projects/DocOwl1.5-Omni/app.py", line 24, in <module>
2024-07-01 00:36:30 | ERROR | stderr | no_change_btn = gr.Button.update()
2024-07-01 00:36:30 | ERROR | stderr | AttributeError: type object 'Button' has no attribute 'update'
2024-07-01 01:50:36 | ERROR | stderr | Traceback (most recent call last):
2024-07-01 01:50:36 | ERROR | stderr | File "/home/wanglch/projects/DocOwl1.5-Omni/app.py", line 24, in <module>
2024-07-01 01:50:36 | ERROR | stderr | no_change_btn = gr.Button.update()
2024-07-01 01:50:36 | ERROR | stderr | AttributeError: type object 'Button' has no attribute 'update'