Commit 7a60e044 authored by wanglch

Initial commit

Pipeline #1185 canceled with stages
name: 🐞 Bug
description: 提交错误报告 | File a bug/issue
title: "[BUG] <title>"
labels: []
body:
  - type: checkboxes
    attributes:
      label: 是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this?
      description: |
        请先搜索您遇到的错误是否在已有的issues或讨论中提到过。
        Please search to see if an issue / discussion already exists for the bug you encountered.
        [Issues](https://github.com/OpenBMB/MiniCPM-V/issues)
        [Discussions](https://github.com/OpenBMB/MiniCPM-V/discussions)
      options:
        - label: 我已经搜索过已有的issues和讨论 | I have searched the existing issues / discussions
          required: true
  - type: checkboxes
    attributes:
      label: 该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ?
      description: |
        请先搜索您遇到的错误是否已在FAQ中有相关解答。
        Please search to see if an answer already exists in FAQ for the bug you encountered.
        [FAQ-en](https://github.com/OpenBMB/MiniCPM-V/blob/main/FAQ.md)
        [FAQ-zh](https://github.com/OpenBMB/MiniCPM-V/blob/main/FAQ_zh.md)
      options:
        - label: 我已经搜索过FAQ | I have searched FAQ
          required: true
  - type: textarea
    attributes:
      label: 当前行为 | Current Behavior
      description: |
        准确描述遇到的行为。
        A concise description of what you're experiencing.
    validations:
      required: false
  - type: textarea
    attributes:
      label: 期望行为 | Expected Behavior
      description: |
        准确描述预期的行为。
        A concise description of what you expected to happen.
    validations:
      required: false
  - type: textarea
    attributes:
      label: 复现方法 | Steps To Reproduce
      description: |
        复现当前行为的详细步骤。
        Steps to reproduce the behavior.
      placeholder: |
        1. In this environment...
        2. With this config...
        3. Run '...'
        4. See error...
    validations:
      required: false
  - type: textarea
    attributes:
      label: 运行环境 | Environment
      description: |
        examples:
          - **OS**: Ubuntu 20.04
          - **Python**: 3.8
          - **Transformers**: 4.31.0
          - **PyTorch**: 2.0.1
          - **CUDA**: 11.4
      value: |
        - OS:
        - Python:
        - Transformers:
        - PyTorch:
        - CUDA (`python -c 'import torch; print(torch.version.cuda)'`):
      render: Markdown
    validations:
      required: false
  - type: textarea
    attributes:
      label: 备注 | Anything else?
      description: |
        您可以在这里补充其他关于该问题背景信息的描述、链接或引用等。
        您可以通过点击高亮此区域然后拖动文件的方式上传图片或日志文件。
        Links? References? Anything that will give us more context about the issue you are encountering!
        Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in.
    validations:
      required: false
blank_issues_enabled: true
name: "💡 Feature Request"
description: 创建新功能请求 | Create a new ticket for a new feature request
title: "💡 [REQUEST] - <title>"
labels: ["question"]
body:
  - type: input
    id: start_date
    attributes:
      label: "起始日期 | Start Date"
      description: |
        起始开发日期
        Start of development
      placeholder: "month/day/year"
    validations:
      required: false
  - type: textarea
    id: implementation_pr
    attributes:
      label: "实现PR | Implementation PR"
      description: |
        实现该功能的Pull request
        Pull request used
      placeholder: "#Pull Request ID"
    validations:
      required: false
  - type: textarea
    id: reference_issues
    attributes:
      label: "相关Issues | Reference Issues"
      description: |
        与该功能相关的issues
        Related issues
      placeholder: "#Issues IDs"
    validations:
      required: false
  - type: textarea
    id: summary
    attributes:
      label: "摘要 | Summary"
      description: |
        简要描述新功能的特点
        Provide a brief explanation of the feature
      placeholder: |
        Describe your feature request in a few lines
    validations:
      required: true
  - type: textarea
    id: basic_example
    attributes:
      label: "基本示例 | Basic Example"
      description: Provide some basic examples of your feature here.
      placeholder: A few specific words about your feature request.
    validations:
      required: true
  - type: textarea
    id: drawbacks
    attributes:
      label: "缺陷 | Drawbacks"
      description: |
        该新功能有哪些缺陷/可能造成哪些影响?
        What are the drawbacks/impacts of your feature request?
      placeholder: |
        Identify the drawbacks and impacts of your feature request while remaining neutral
    validations:
      required: true
  - type: textarea
    id: unresolved_question
    attributes:
      label: "未解决问题 | Unresolved questions"
      description: |
        有哪些尚未解决的问题?
        What questions remain unresolved?
      placeholder: |
        Identify any unresolved issues.
    validations:
      required: false
*.bk
__pycache__
.DS_Store
{
"githubPullRequests.ignoredPullRequestBranches": [
"main"
]
}
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 OpenBMB
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# MiniCPM-V
**MiniCPM-V** is a series of on-device multimodal large models for image-text understanding. The models accept image and text inputs and produce high-quality text outputs.
## Paper
- [MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies](https://arxiv.org/abs/2404.06395)
## Model Architecture
MiniCPM is a series of on-device large models jointly open-sourced by ModelBest (面壁智能) and the NLP Lab of Tsinghua University. The base language model, MiniCPM-2B, has only 2.4B non-embedding parameters (2.7B parameters in total).
<div align="center">
<img src="./assets/OIP-C.jpg"/>
</div>
## Algorithm
MiniCPM-V 2.0 is built on MiniCPM 2.4B and SigLip-400M, with 2.8B parameters in total. It offers leading optical character recognition (OCR) and multimodal understanding capabilities: it achieves the best open-source result on the comprehensive OCR benchmark OCRBench, and approaches Gemini Pro on scene-text understanding.
<div align=center>
<img src="./assets/transformer.png"/>
</div>
## Environment Setup
### Docker (option 1)
Pull the Docker image from [SourceFind](https://www.sourcefind.cn/#/service-details) and set it up as follows:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro --shm-size=64G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name minicpm-v <your imageID> bash
cd /path/your_code_data/
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile (option 2)
```
cd /path/your_code_data/docker
docker build --no-cache -t minicpm-v:latest .
docker run --shm-size=64G --name minicpm-v -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v /path/your_code_data/:/path/your_code_data/ -it minicpm-v bash
```
### Anaconda (option 3)
The DCU-specific deep learning libraries required by this project can be downloaded from the [HPC developer community](https://developer.hpccube.com/tool/):
```
DTK driver: dtk23.10
python: 3.10
torch: 2.1
torchvision: 0.16.0
deepspeed: 0.12.3
```
`Tip: the DTK driver, Python, Torch, and related DCU tool versions above must match each other exactly.`
```
conda create -n minicpm-v python=3.10
conda activate minicpm-v
cd /path/your_code_data/
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple
```
## Dataset
Mini dataset: [self_build](data/self_build/)
This repository provides a small self-built dataset for testing the training code. To prepare your own training data, collect all samples into a single list and save it as one JSON file, where each sample is a dictionary as in the example below. Full datasets for real training should follow the same structure:
```
[
    {
        "id": "0",
        "image": "path/to/image_0.jpg",
        "conversations": [
            {
                "role": "user",
                "content": "<image>\nHow many desserts are on the white plate?"
            },
            {
                "role": "assistant",
                "content": "There are three desserts on the white plate."
            },
            {
                "role": "user",
                "content": "What type of desserts are they?"
            },
            {
                "role": "assistant",
                "content": "The desserts are cakes with bananas and pecans on top. They share similarities with donuts, but the presence of bananas and pecans differentiates them."
            },
            {
                "role": "user",
                "content": "What is the setting of the image?"
            },
            {
                "role": "assistant",
                "content": "The image is set on a table top with a plate containing the three desserts."
            }
        ]
    }
]
```
## Training
Edit the following parameters in [finetune_lora.sh](finetune_lora.sh) before training:
```
MODEL="openbmb/MiniCPM-Llama3-V-2_5" # or change to a local model path
DATA="path/to/training_data" # your local training-set JSON file
EVAL_DATA="path/to/test_data" # your local validation-set JSON file
--output_dir /home/wanglch/projects/saves/MiniCPM-Llama3-V-2_5/lora_train_dtk \
--logging_dir /home/wanglch/projects/saves/MiniCPM-Llama3-V-2_5/lora_train_dtk \
```
### Multi-GPU distributed training
```
sh finetune_lora.sh
```
## Inference
For the tasks below, modify the following parameter:
```
model_path = 'openbmb/MiniCPM-Llama3-V-2_5' # change to a local model path
```
### Single node, single GPU
```
sh web_demo_2.5.sh
```
### Single node, multiple GPUs
```
sh web_demo_2.5_multi.sh
```
## Results
### Bank draft OCR
<div align=center>
<img src="./assets/result_1.png"/>
</div>
### Acceptance bill OCR
<div align=center>
<img src="./assets/result_2.png"/>
</div>
### Invoice OCR
<div align=center>
<img src="./assets/result_3.png"/>
</div>
### Accuracy
Test data: [self_build](data/self_build/); accelerators used: A800 / K100.
| device | train_loss | eval_loss |
| :------: | :------: | :------: |
| A800 | 0.065 | 0.389 |
| K100 | 0.0635 | 0.391 |
## Application Scenarios
### Algorithm category
`ocr`
### Key application industries
`Finance, education, government, research, manufacturing, energy, transportation`
## Pretrained Weights
- [openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)
## Source Repository & Feedback
- http://developer.hpccube.com/codes/modelzoo/umt5.git
## References
- [MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies](https://arxiv.org/abs/2404.06395)
- [MiniCPM-V github](https://github.com/OpenBMB/MiniCPM-V?tab=readme-ov-file)
- English
- 中文
- 한국어
- 日本語
- Deutsch
- Français
- Português
- Español
- မြန်မာဘာသာ
- ไทย
- Tiếng Việt
- Türkçe
- ܣܘܪܝܝܐ
- العربية
- हिन्दी
- বাংলা
- नेपाली
- Türkmençe
- Тоҷикӣ
- Кыргызча
- Русский
- Українська
- Беларуская
- ქართული
- Azərbaycanca
- Հայերեն
- Polski
- Lietuvių
- Eesti
- Latviešu
- Čeština
- Slovenčina
- Magyar
- Slovenščina
- Hrvatski
- Bosanski
- Crnogorski
- Српски
- Shqip
- Română
- Български
- Македонски
## Supported Languages
English
Chinese
Korean
Japanese
German
French
Portuguese
Spanish
Burmese
Thai
Vietnamese
Turkish
Syriac
Arabic
Hindi
Bengali
Nepali
Turkmen
Tajik
Kyrgyz
Russian
Ukrainian
Belarusian
Georgian
Azerbaijani
Armenian
Polish
Lithuanian
Estonian
Latvian
Czech
Slovak
Hungarian
Slovenian
Croatian
Bosnian
Montenegrin
Serbian
Albanian
Romanian
Bulgarian
Macedonian
import os
import torch
import json
from PIL import Image
import base64
import io
from accelerate import load_checkpoint_and_dispatch, init_empty_weights
from transformers import AutoTokenizer, AutoModel
from omnilmm.utils import disable_torch_init
from omnilmm.model.omnilmm import OmniLMMForCausalLM
from omnilmm.model.utils import build_transform
from omnilmm.train.train_utils import omni_preprocess

DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"


def init_omni_lmm(model_path):
    torch.backends.cuda.matmul.allow_tf32 = True
    disable_torch_init()
    model_name = os.path.expanduser(model_path)
    print(f'Load omni_lmm model and tokenizer from {model_name}')
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, model_max_length=2048)

    # Set to True to dispatch the model across multiple small-memory GPUs
    # (e.g. 2x Nvidia 3090 24G).
    use_multi_gpu = False
    if use_multi_gpu:
        with init_empty_weights():
            model = OmniLMMForCausalLM.from_pretrained(
                model_name, tune_clip=True, torch_dtype=torch.bfloat16)
        model = load_checkpoint_and_dispatch(
            model, model_name, dtype=torch.bfloat16, device_map="auto",
            no_split_module_classes=['Eva', 'MistralDecoderLayer', 'ModuleList', 'Resampler'])
    else:
        model = OmniLMMForCausalLM.from_pretrained(
            model_name, tune_clip=True, torch_dtype=torch.bfloat16
        ).to(device='cuda', dtype=torch.bfloat16)

    image_processor = build_transform(
        is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP')

    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    assert mm_use_im_start_end

    tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN,
                          DEFAULT_IM_END_TOKEN], special_tokens=True)

    vision_config = model.model.vision_config
    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
        [DEFAULT_IMAGE_PATCH_TOKEN])[0]
    vision_config.use_im_start_end = mm_use_im_start_end
    vision_config.im_start_token, vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
        [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
    image_token_len = model.model.config.num_query

    return model, image_processor, image_token_len, tokenizer
def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token):
    if '<image>' in question_text[0]['content']:
        question_text[0]['content'] = question_text[0]['content'].replace(
            '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token)
    else:
        question_text[0]['content'] = im_st_token + im_patch_token * \
            image_token_len + im_ed_token + '\n' + question_text[0]['content']
    return question_text
def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
    question = expand_question_into_multimodal(
        question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN)
    conversation = question
    data_dict = omni_preprocess(sources=[conversation],
                                tokenizer=tokenizer,
                                generation=True)
    data_dict = dict(input_ids=data_dict["input_ids"][0],
                     labels=data_dict["labels"][0])
    return data_dict
class OmniLMM12B:
    def __init__(self, model_path) -> None:
        model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
        self.model = model
        self.image_token_len = image_token_len
        self.image_transform = img_processor
        self.tokenizer = tokenizer
        self.model.eval()

    def decode(self, image, input_ids):
        with torch.inference_mode():
            output = self.model.generate_vllm(
                input_ids=input_ids.unsqueeze(0).cuda(),
                images=image.unsqueeze(0).half().cuda(),
                temperature=0.6,
                max_new_tokens=1024,
                do_sample=True,
                output_scores=True,
                return_dict_in_generate=True,
                repetition_penalty=1.1,
                top_k=30,
                top_p=0.9,
            )
        response = self.tokenizer.decode(
            output.sequences[0], skip_special_tokens=True)
        return response.strip()

    def chat(self, input):
        try:
            image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB')
        except Exception:
            return "Image decode error"
        msgs = json.loads(input['question'])
        input_ids = wrap_question_for_omni_lmm(
            msgs, self.image_token_len, self.tokenizer)['input_ids']
        input_ids = torch.as_tensor(input_ids)
        image = self.image_transform(image)
        return self.decode(image, input_ids)
def img2base64(file_name):
    with open(file_name, 'rb') as f:
        encoded_string = base64.b64encode(f.read())
    return encoded_string
class MiniCPMV:
    def __init__(self, model_path) -> None:
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model.eval().cuda()

    def chat(self, input):
        try:
            image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB')
        except Exception:
            return "Image decode error"
        msgs = json.loads(input['question'])
        answer, context, _ = self.model.chat(
            image=image,
            msgs=msgs,
            context=None,
            tokenizer=self.tokenizer,
            sampling=True,
            temperature=0.7
        )
        return answer
class MiniCPMV2_5:
    def __init__(self, model_path) -> None:
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float16)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model.eval().cuda()

    def chat(self, input):
        try:
            image = Image.open(io.BytesIO(base64.b64decode(input['image']))).convert('RGB')
        except Exception:
            return "Image decode error"
        msgs = json.loads(input['question'])
        answer = self.model.chat(
            image=image,
            msgs=msgs,
            tokenizer=self.tokenizer,
            sampling=True,
            temperature=0.7
        )
        return answer
class MiniCPMVChat:
    def __init__(self, model_path) -> None:
        if '12B' in model_path:
            self.model = OmniLMM12B(model_path)
        elif 'MiniCPM-Llama3-V' in model_path:
            self.model = MiniCPMV2_5(model_path)
        else:
            self.model = MiniCPMV(model_path)

    def chat(self, input):
        return self.model.chat(input)
if __name__ == '__main__':
    model_path = 'openbmb/OmniLMM-12B'
    chat_model = MiniCPMVChat(model_path)
    im_64 = img2base64('./assets/worldmap_ck.jpg')

    # first round chat
    msgs = [{"role": "user", "content": "What is interesting about this image?"}]
    input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
    answer = chat_model.chat(input)
    print(msgs[-1]["content"] + '\n', answer)

    # second round chat
    msgs.append({"role": "assistant", "content": answer})
    msgs.append({"role": "user", "content": "Where is China in the image?"})
    input = {"image": im_64, "question": json.dumps(msgs, ensure_ascii=True)}
    answer = chat_model.chat(input)
    print(msgs[-1]["content"] + '\n', answer)