"...resnet50_tensorflow.git" did not exist on "09bc9f54fb7084b7908447572938b2e203d7c232"
Unverified commit 919280aa authored by Xiaomeng Zhao, committed by GitHub

Merge branch 'dev' into multi_gpu_v2

parents ea9336c0 c6881d83
@@ -109,14 +109,11 @@ body:
  - type: dropdown
    id: software_version
    attributes:
      label: Software version | 软件版本 (mineru --version)
      #multiple: false
      options:
        -
        - "2.0.x"
    validations:
      required: true
......
This diff is collapsed.
This diff is collapsed.
@@ -25,8 +25,8 @@ def do_parse(
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    formula_enable=True,  # Enable formula parsing
    table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
@@ -45,7 +45,7 @@ def do_parse(
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes
        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable)
        for idx, model_list in enumerate(infer_results):
            model_json = copy.deepcopy(model_list)
@@ -57,7 +57,7 @@ def do_parse(
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, formula_enable)
            pdf_info = middle_json["pdf_info"]
@@ -169,8 +169,8 @@ def parse_doc(
    backend="pipeline",
    method="auto",
    server_url=None,
    start_page_id=0,
    end_page_id=None
):
    """
    Parameter description:
@@ -192,6 +192,8 @@ def parse_doc(
        Without method specified, 'auto' will be used by default.
        Adapted only for the case where the backend is set to "pipeline".
        server_url: When the backend is `sglang-client`, you need to specify the server_url, for example: `http://127.0.0.1:30000`
        start_page_id: Start page ID for parsing, default is 0
        end_page_id: End page ID for parsing, default is None (parse all pages until the end of the document)
    """
    try:
        file_name_list = []
......
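For reference, a minimal sketch of calling the updated `parse_doc` from this demo. The keyword arguments follow the signature in the hunk above; the import path and the leading positional parameters (input paths, output location) are truncated in the diff, so they are assumptions here:

```python
# Usage sketch for parse_doc as shown above (assumptions flagged inline).
from pathlib import Path

# Hypothetical import path: the function lives in the demo script of this repo.
from demo.demo import parse_doc

if __name__ == "__main__":
    doc_paths = [Path("demo/pdfs/demo1.pdf")]  # hypothetical input file
    parse_doc(
        doc_paths,
        "output",            # assumed output-directory parameter (truncated in the hunk)
        backend="pipeline",
        method="auto",
        server_url=None,     # only needed for the sglang-client backend
        start_page_id=0,
        end_page_id=None,    # None = parse to the end of the document
    )
```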
# Use the official sglang image
FROM lmsysorg/sglang:v0.4.8.post1-cu126
# install mineru latest
RUN python3 -m pip install -U 'mineru[core]' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages
......
# Documentation:
# https://docs.sglang.ai/backend/server_arguments.html#common-launch-commands
services:
  mineru-sglang:
    image: mineru-sglang:latest
@@ -11,6 +13,10 @@ services:
    command:
      --host 0.0.0.0
      --port 30000
      # --enable-torch-compile  # You can also enable torch.compile to accelerate inference speed by approximately 15%
      # --dp 2  # If you have more than two GPUs with 24GB VRAM or above, you can use sglang's multi-GPU parallel mode to increase throughput
      # --tp 2  # If you have two GPUs with 12GB or 16GB VRAM, you can use Tensor Parallel (TP) mode
      # --mem-fraction-static 0.7  # If you have two GPUs with 11GB VRAM, in addition to Tensor Parallel mode, you need to reduce the KV cache size
    ulimits:
      memlock: -1
      stack: 67108864
@@ -23,4 +29,4 @@ services:
      devices:
        - driver: nvidia
          device_ids: ["0"]
          capabilities: [gpu]
\ No newline at end of file
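Before pointing the `vlm-sglang-client` backend at this container, it helps to confirm the server is actually up. A minimal probe, assuming the standard sglang `/health` endpoint on the port mapped above (adjust if you changed `--host`/`--port`):

```python
# Probe the sglang server started by the compose file above.
# Assumes the standard sglang /health endpoint; the host and port
# mirror the --host 0.0.0.0 / --port 30000 settings in the command.
import urllib.request


def sglang_is_healthy(base_url: str = "http://127.0.0.1:30000") -> bool:
    try:
        with urllib.request.urlopen(f"{base_url}/health", timeout=5) as resp:
            return resp.status == 200
    except OSError:
        return False


if __name__ == "__main__":
    print("sglang up:", sglang_is_healthy())
```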
# Use the official sglang image
FROM lmsysorg/sglang:v0.4.8.post1-cu126
# install mineru latest
RUN python3 -m pip install -U 'mineru[core]' --break-system-packages
......
# Frequently Asked Questions
### 1. When using the command `pip install magic-pdf[full]` on newer versions of macOS, the error `zsh: no matches found: magic-pdf[full]` occurs.
On macOS, the default shell has switched from Bash to Z shell, which has special handling logic for certain types of string matching. This can lead to the "no matches found" error. You can try disabling the globbing feature in the command line and then run the installation command again.
```bash
setopt no_nomatch
pip install magic-pdf[full]
```
### 2. Encountering the error `_pickle.UnpicklingError: invalid load key, 'v'.` during use
This may be caused by an incompletely downloaded model file. Try re-downloading the model files and running again.
Reference: https://github.com/opendatalab/MinerU/issues/143
### 3. Where should the model files be downloaded, and how should the `models-dir` configuration be set?
The path to the model files is configured in "magic-pdf.json", like so:
```json
{
"models-dir": "/tmp/models"
}
```
This path must be an absolute path, not a relative path. You can obtain the absolute path by running the "pwd" command inside the models directory.
Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
### 4. Encountered the error `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2
The `libgl` library is missing in Ubuntu 22.04 on WSL2. You can install the `libgl` library with the following command to resolve the issue:
@@ -37,59 +10,14 @@ sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388
### 5. Encountered error `ModuleNotFoundError: No module named 'fairscale'`
You need to uninstall the module and reinstall it:
```bash
pip uninstall fairscale
pip install fairscale
```
Reference: https://github.com/opendatalab/MinerU/issues/411
### 6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
CUDA 11 has poor compatibility with newer graphics cards, so the CUDA version used by Paddle needs to be upgraded:
```bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
```
Reference: https://github.com/opendatalab/MinerU/issues/558
### 7. On some Linux servers, the program immediately reports an error `Illegal instruction (core dumped)`
This might be because the server's CPU does not support the AVX/AVX2 instruction set, or the CPU supports it but the instruction set has been disabled by the system administrator. You can try contacting the administrator to remove the restriction, or switch to a different server.
References: https://github.com/opendatalab/MinerU/issues/591 , https://github.com/opendatalab/MinerU/issues/736
### 8. Error when installing MinerU on CentOS 7 or Ubuntu 18: `ERROR: Failed building wheel for simsimd`
The new version of albumentations (1.4.21) introduces a dependency on simsimd. Since the pre-built package of simsimd for Linux requires a glibc version greater than or equal to 2.28, this causes installation issues on some Linux distributions released before 2019. You can resolve this issue by using the following commands:
```
conda create -n mineru python=3.11 -y
conda activate mineru
pip install -U "mineru[pipeline_old_linux]"
```
Reference: https://github.com/opendatalab/MinerU/issues/1004
### 9. Old Graphics Cards Such as M40 Encounter "RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED"
The following error occurs at runtime (with CUDA):
```
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, (int)num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
```
Because BF16 precision is not supported on graphics cards before the Turing architecture and some graphics cards are not recognized by torch, it is necessary to manually disable BF16 precision.
Modify the code in lines 287-290 of the "pdf_parse_union_core_v2.py" file (note that the location may vary in different versions):
```python
if torch.cuda.is_bf16_supported():
supports_bfloat16 = True
else:
supports_bfloat16 = False
```
Change it to:
```python
supports_bfloat16 = False
```
Reference: https://github.com/opendatalab/MinerU/issues/1508
\ No newline at end of file
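Before patching as in item 9, it is worth confirming that the card is actually the problem. A small diagnostic using only standard PyTorch calls:

```python
# Quick diagnostic for FAQ item 9: report whether the visible GPU supports
# BF16, which the supports_bfloat16 check in pdf_parse_union_core_v2.py relies on.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"GPU: {torch.cuda.get_device_name(0)} (compute capability {major}.{minor})")
    print("torch reports BF16 support:", torch.cuda.is_bf16_supported())
else:
    print("No CUDA device visible; the CUBLAS error above does not apply.")
```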
# Frequently Asked Questions
### 1. On newer versions of macOS, installing with `pip install magic-pdf[full]` fails with `zsh: no matches found: magic-pdf[full]`
On macOS, the default shell has switched from Bash to Z shell, and Z shell has special handling logic for certain kinds of string matching, which can cause the "no matches found" error.
You can disable the globbing feature on the command line and then retry the installation command:
```bash
setopt no_nomatch
pip install magic-pdf[full]
```
### 2. Encountering the error `_pickle.UnpicklingError: invalid load key, 'v'.` during use
This may be caused by an incompletely downloaded model file; try re-downloading the model files and running again.
Reference: https://github.com/opendatalab/MinerU/issues/143
### 3. Where should the model files be downloaded, and how should the `models-dir` configuration be set?
The path to the model files is configured in "magic-pdf.json", like so:
```json
{
  "models-dir": "/tmp/models"
}
```
This path must be an absolute path, not a relative path. You can obtain the absolute path by running the "pwd" command inside the models directory.
Reference: https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
### 4. Encountered the error `ImportError: libGL.so.1: cannot open shared object file: No such file or directory` in Ubuntu 22.04 on WSL2
The `libgl` library is missing in Ubuntu 22.04 on WSL2; you can install it with the following command:
@@ -39,59 +10,14 @@ sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388
### 5. Encountered the error `ModuleNotFoundError: No module named 'fairscale'`
You need to uninstall the module and reinstall it:
```bash
pip uninstall fairscale
pip install fairscale
```
Reference: https://github.com/opendatalab/MinerU/issues/411
### 6. On some newer devices such as the H100, text parsed with CUDA-accelerated OCR is garbled
CUDA 11 has poor compatibility with newer graphics cards, so the CUDA version used by Paddle needs to be upgraded:
```bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
```
Reference: https://github.com/opendatalab/MinerU/issues/558
### 7. On some Linux servers, the program fails immediately with `非法指令 (核心已转储)` or `Illegal instruction (core dumped)`
This may be because the server's CPU does not support the AVX/AVX2 instruction set, or the CPU supports it but the instruction set has been disabled by the operations team. You can try contacting them to lift the restriction, or switch to a different server.
References: https://github.com/opendatalab/MinerU/issues/591 , https://github.com/opendatalab/MinerU/issues/736
### 8. Error when installing MinerU on CentOS 7 or Ubuntu 18: `ERROR: Failed building wheel for simsimd`
The new version of albumentations (1.4.21) introduces a dependency on simsimd. Since the pre-built simsimd package for Linux requires a glibc version greater than or equal to 2.28, some Linux distributions released before 2019 cannot install it normally. You can install with the following commands instead:
```
conda create -n mineru python=3.11 -y
conda activate mineru
pip install -U "mineru[pipeline_old_linux]"
```
Reference: https://github.com/opendatalab/MinerU/issues/1004
### 9. Older graphics cards such as the M40 report "RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED"
The following error occurs at runtime (with CUDA):
```
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, (int)num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
```
Because graphics cards before the Turing architecture do not support BF16 precision, and some cards are not correctly recognized by PyTorch, BF16 precision must be disabled manually.
Find and modify lines 287-290 of `pdf_parse_union_core_v2.py` (note: the location may differ between versions). The original code:
```python
if torch.cuda.is_bf16_supported():
    supports_bfloat16 = True
else:
    supports_bfloat16 = False
```
Change it to:
```python
supports_bfloat16 = False
```
Reference: https://github.com/opendatalab/MinerU/issues/1508
docs/images/layout_example.png: image replaced (559 KB → 626 KB).
## Overview
After executing the `mineru` command, in addition to outputting files related to markdown, several other files unrelated to markdown will also be generated. These files will be introduced one by one.
### some_pdf_layout.pdf
Each page's layout consists of one or more bounding boxes. The number in the top-right corner of each box indicates the reading order. Additionally, different content blocks are highlighted with distinct background colors within the layout.pdf.
![layout example](images/layout_example.png)
### some_pdf_spans.pdf (Applicable only to the pipeline backend)
All spans on the page are drawn with different colored line frames according to the span type. This file can be used for quality control, allowing for quick identification of issues such as missing text or unrecognized inline formulas.
![spans example](images/spans_example.png)
### some_pdf_model.json (Applicable only to the pipeline backend)
#### Structure Definition
@@ -117,13 +116,39 @@ The format of the poly coordinates is \[x0, y0, x1, y1, x2, y2, x3, y3\], repres
]
```
### some_pdf_model_output.txt (Applicable only to the VLM backend)
This file contains the output of the VLM model, with each page's output separated by `----`.
Each page's output consists of text blocks starting with `<|box_start|>` and ending with `<|md_end|>`.
The meaning of each field is as follows:
- `<|box_start|>x0 y0 x1 y1<|box_end|>`
x0 y0 x1 y1 are the coordinates of the block's bounding rectangle, giving its top-left and bottom-right corners. The values are based on a page normalized to 1000x1000.
- `<|ref_start|>type<|ref_end|>`
`type` indicates the block type. Possible values are:
```json
{
"text": "Text",
"title": "Title",
"image": "Image",
"image_caption": "Image Caption",
"image_footnote": "Image Footnote",
"table": "Table",
"table_caption": "Table Caption",
"table_footnote": "Table Footnote",
"equation": "Interline Equation"
}
```
- `<|md_start|>Markdown content<|md_end|>`
This field contains the Markdown content of the block. If `type` is `text`, the end of the text may contain the `<|txt_contd|>` tag, indicating that this block can be connected with the following `text` block(s).
If `type` is `table`, the content is in `otsl` format and needs to be converted into HTML for rendering in Markdown.
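To make the field layout concrete, here is a short sketch that splits a `model_output.txt` into pages and typed blocks and rescales the normalized boxes. The tag grammar follows the description above; the assumption that the three tag groups appear in this order, separated only by whitespace, is mine, and the real page dimensions must be supplied by the caller:

```python
import re

# One block: <|box_start|>...<|box_end|> <|ref_start|>type<|ref_end|> <|md_start|>md<|md_end|>
BLOCK_RE = re.compile(
    r"<\|box_start\|>(.*?)<\|box_end\|>\s*"
    r"<\|ref_start\|>(.*?)<\|ref_end\|>\s*"
    r"<\|md_start\|>(.*?)<\|md_end\|>",
    re.S,
)


def parse_model_output(raw: str, page_w: float, page_h: float):
    """Split model_output.txt into pages of typed blocks with rescaled boxes."""
    pages = []
    for page_text in raw.split("----"):  # pages are separated by "----"
        blocks = []
        for box, block_type, md in BLOCK_RE.findall(page_text):
            x0, y0, x1, y1 = (float(v) for v in box.split())
            blocks.append({
                "type": block_type,
                # Boxes are given on a page normalized to 1000x1000;
                # rescale to the real page size.
                "bbox": [x0 / 1000 * page_w, y0 / 1000 * page_h,
                         x1 / 1000 * page_w, y1 / 1000 * page_h],
                "markdown": md,
                # A trailing <|txt_contd|> means the text continues in the next block.
                "continues": md.rstrip().endswith("<|txt_contd|>"),
            })
        pages.append(blocks)
    return pages
```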
### some_pdf_middle.json
| Field Name | Description |
|:---------------|:----------------------------------------------------------------------------------------------------------------|
| pdf_info | list, each element is a dict representing the parsing result of each PDF page, see the table below for details |
| \_backend | pipeline \| vlm, used to indicate the mode used in this intermediate parsing state |
| \_version_name | string, indicates the version of mineru used in this parsing |
<br>
@@ -324,7 +349,92 @@ First-level block (if any) -> Second-level block -> Line -> Span
        ]
      }
    ],
    "_backend": "pipeline",
    "_version_name": "0.6.1"
}
```
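A small sketch of inspecting these top-level fields, assuming only the structure documented in the table above:

```python
# Inspect the top-level structure of some_pdf_middle.json.
import json

with open("some_pdf_middle.json", encoding="utf-8") as f:
    middle = json.load(f)

print("backend:", middle["_backend"])            # "pipeline" or "vlm"
print("mineru version:", middle["_version_name"])
print("pages parsed:", len(middle["pdf_info"]))  # one dict per PDF page
```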
### some_pdf_content_list.json
This file is a JSON array where each element is a dict storing all readable content blocks in the document in reading order.
`content_list` can be viewed as a simplified version of `middle.json`. The content block types are mostly consistent with those in `middle.json`, but layout information is not included.
The content has the following types:
| type | desc |
|:---------|:--------------|
| image | Image |
| table | Table |
| text | Text / Title |
| equation | Block formula |
Please note that title and text blocks in `content_list` are both represented using the `text` type. The `text_level` field is used to distinguish the hierarchy of text blocks:
- A block without the `text_level` field or with `text_level=0` represents body text.
- A block with `text_level=1` represents a level-1 heading.
- A block with `text_level=2` represents a level-2 heading, and so on.
Each content entry contains a `page_idx` field, indicating the page number (starting from 0) on which the content block resides.
#### Example
```json
[
{
"type": "text",
"text": "The response of flow duration curves to afforestation ",
"text_level": 1,
"page_idx": 0
},
{
"type": "text",
"text": "Received 1 October 2003; revised 22 December 2004; accepted 3 January 2005 ",
"page_idx": 0
},
{
"type": "text",
"text": "Abstract ",
"text_level": 2,
"page_idx": 0
},
{
"type": "text",
"text": "The hydrologic effect of replacing pasture or other short crops with trees is reasonably well understood on a mean annual basis. The impact on flow regime, as described by the annual flow duration curve (FDC) is less certain. A method to assess the impact of plantation establishment on FDCs was developed. The starting point for the analyses was the assumption that rainfall and vegetation age are the principal drivers of evapotranspiration. A key objective was to remove the variability in the rainfall signal, leaving changes in streamflow solely attributable to the evapotranspiration of the plantation. A method was developed to (1) fit a model to the observed annual time series of FDC percentiles; i.e. 10th percentile for each year of record with annual rainfall and plantation age as parameters, (2) replace the annual rainfall variation with the long term mean to obtain climate adjusted FDCs, and (3) quantify changes in FDC percentiles as plantations age. Data from 10 catchments from Australia, South Africa and New Zealand were used. The model was able to represent flow variation for the majority of percentiles at eight of the 10 catchments, particularly for the 10–50th percentiles. The adjusted FDCs revealed variable patterns in flow reductions with two types of responses (groups) being identified. Group 1 catchments show a substantial increase in the number of zero flow days, with low flows being more affected than high flows. Group 2 catchments show a more uniform reduction in flows across all percentiles. The differences may be partly explained by storage characteristics. The modelled flow reductions were in accord with published results of paired catchment experiments. An additional analysis was performed to characterise the impact of afforestation on the number of zero flow days $( N _ { \\mathrm { z e r o } } )$ for the catchments in group 1. This model performed particularly well, and when adjusted for climate, indicated a significant increase in $N _ { \\mathrm { z e r o } }$ . The zero flow day method could be used to determine change in the occurrence of any given flow in response to afforestation. The methods used in this study proved satisfactory in removing the rainfall variability, and have added useful insight into the hydrologic impacts of plantation establishment. This approach provides a methodology for understanding catchment response to afforestation, where paired catchment data is not available. ",
"page_idx": 0
},
{
"type": "text",
"text": "1. Introduction ",
"text_level": 2,
"page_idx": 1
},
{
"type": "image",
"img_path": "images/a8ecda1c69b27e4f79fce1589175a9d721cbdc1cf78b4cc06a015f3746f6b9d8.jpg",
"img_caption": [
"Fig. 1. Annual flow duration curves of daily flows from Pine Creek, Australia, 1989–2000. "
],
"img_footnote": [],
"page_idx": 1
},
{
"type": "equation",
"img_path": "images/181ea56ef185060d04bf4e274685f3e072e922e7b839f093d482c29bf89b71e8.jpg",
"text": "$$\nQ _ { \\% } = f ( P ) + g ( T )\n$$",
"text_format": "latex",
"page_idx": 2
},
{
"type": "table",
"img_path": "images/e3cb413394a475e555807ffdad913435940ec637873d673ee1b039e3bc3496d0.jpg",
"table_caption": [
"Table 2 Significance of the rainfall and time terms "
],
"table_footnote": [
"indicates that the rainfall term was significant at the $5 \\%$ level, $T$ indicates that the time term was significant at the $5 \\%$ level, \\* represents significance at the $10 \\%$ level, and na denotes too few data points for meaningful analysis. "
],
"table_body": "<html><body><table><tr><td rowspan=\"2\">Site</td><td colspan=\"10\">Percentile</td></tr><tr><td>10</td><td>20</td><td>30</td><td>40</td><td>50</td><td>60</td><td>70</td><td>80</td><td>90</td><td>100</td></tr><tr><td>Traralgon Ck</td><td>P</td><td>P,*</td><td>P</td><td>P</td><td>P,</td><td>P,</td><td>P,</td><td>P,</td><td>P</td><td>P</td></tr><tr><td>Redhill</td><td>P,T</td><td>P,T</td><td>,*</td><td>**</td><td>P.T</td><td>P,*</td><td>P*</td><td>P*</td><td>*</td><td>,*</td></tr><tr><td>Pine Ck</td><td></td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>T</td><td>T</td><td>na</td><td>na</td></tr><tr><td>Stewarts Ck 5</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P.T</td><td>P.T</td><td>P,T</td><td>na</td><td>na</td><td>na</td></tr><tr><td>Glendhu 2</td><td>P</td><td>P,T</td><td>P,*</td><td>P,T</td><td>P.T</td><td>P,ns</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td></tr><tr><td>Cathedral Peak 2</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>*,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td></tr><tr><td>Cathedral Peak 3</td><td>P.T</td><td>P.T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td></tr><tr><td>Lambrechtsbos A</td><td>P,T</td><td>P</td><td>P</td><td>P,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>T</td></tr><tr><td>Lambrechtsbos B</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>T</td></tr><tr><td>Biesievlei</td><td>P,T</td><td>P.T</td><td>P,T</td><td>P,T</td><td>*,T</td><td>*,T</td><td>T</td><td>T</td><td>P,T</td><td>P,T</td></tr></table></body></html>",
"page_idx": 5
}
]
```
\ No newline at end of file
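Since headings and body text share the `text` type, extracting a document outline is just a filter on `text_level`. A minimal sketch against the schema above:

```python
# Build a heading outline from some_pdf_content_list.json.
# text_level is absent or 0 for body text, 1 for H1, 2 for H2, and so on.
import json

with open("some_pdf_content_list.json", encoding="utf-8") as f:
    content_list = json.load(f)

for item in content_list:
    level = item.get("text_level", 0)
    if item["type"] == "text" and level > 0:
        indent = "  " * (level - 1)
        print(f"{indent}{item['text'].strip()} (p. {item['page_idx'] + 1})")
```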
## Overview
After the `mineru` command is executed, in addition to the markdown file, several files unrelated to markdown may also be generated. These files are introduced one by one below.
### some_pdf_layout.pdf
Each page's layout consists of one or more boxes. The number in the top-right corner of each box indicates its reading order. In addition, layout.pdf marks out the different content blocks with different background colors.
![layout page example](images/layout_example.png)
### some_pdf_spans.pdf (Applicable only to the pipeline backend)
All spans on the page are drawn with line frames of different colors according to the span type. This file can be used for quality control, making it easy to spot problems such as missing text or unrecognized inline formulas.
![span page example](images/spans_example.png)
### some_pdf_model.json (Applicable only to the pipeline backend)
#### Structure Definition
@@ -117,13 +117,39 @@ The poly coordinate format is \[x0, y0, x1, y1, x2, y2, x3, y3\], representing the top-left,
]
```
### some_pdf_model_output.txt (Applicable only to the VLM backend)
This file is the output of the VLM model, with each page's output separated by `----`.
Each page's output consists of text blocks that start with `<|box_start|>` and end with `<|md_end|>`.
The fields have the following meanings:
- `<|box_start|>x0 y0 x1 y1<|box_end|>`
  x0 y0 x1 y1 are the coordinates of the block's bounding rectangle, giving its top-left and bottom-right corners; the values are the coordinates after the page has been scaled to 1000x1000.
- `<|ref_start|>type<|ref_end|>`
  type is the block's type; the possible values are:
```json
{
    "text": "Text",
    "title": "Title",
    "image": "Image",
    "image_caption": "Image Caption",
    "image_footnote": "Image Footnote",
    "table": "Table",
    "table_caption": "Table Caption",
    "table_footnote": "Table Footnote",
    "equation": "Interline Equation"
}
```
- `<|md_start|>Markdown content<|md_end|>`
  This field is the block's Markdown content. If type is text, the end of the text may carry the `<|txt_contd|>` tag, indicating that the block can be joined with the following text block(s).
  If type is table, the content is a table in `otsl` format and must be converted to HTML before it can be rendered in Markdown.
### some_pdf_middle.json
| Field Name | Description |
|:---------------|:----------------------------------------------------------------------------|
| pdf_info | list, each element is a dict representing the parsing result of one PDF page, see the table below for details |
| \_backend | pipeline \| vlm, used to indicate the mode used in this intermediate parsing state |
| \_version_name | string, indicates the version of mineru used in this parsing |
<br>
@@ -323,7 +349,86 @@ The elements stored in para_blocks are block-level information
        ]
      }
    ],
    "_backend": "pipeline",
    "_version_name": "0.6.1"
}
```
### some_pdf_content_list.json
This file is a JSON array in which each element is a dict; all readable content blocks of the document are stored flat, in reading order.
content_list can be seen as a simplified middle.json: the content block types are largely the same as in middle.json, but layout information is not included.
The content types are as follows:
| type | desc |
|:---------|:--------------|
| image | Image |
| table | Table |
| text | Text / Title |
| equation | Block formula |
Note that title and text blocks in content_list are both represented with the text type; the `text_level` field distinguishes the hierarchy of text blocks. A block without the `text_level` field, or with `text_level` 0, is body text; `text_level` 1 is a level-1 heading; `text_level` 2 is a level-2 heading; and so on.
Each content entry contains a `page_idx` field, indicating the page number (starting from 0) on which the content block resides.
#### Example
```json
[
{
"type": "text",
"text": "The response of flow duration curves to afforestation ",
"text_level": 1,
"page_idx": 0
},
{
"type": "text",
"text": "Received 1 October 2003; revised 22 December 2004; accepted 3 January 2005 ",
"page_idx": 0
},
{
"type": "text",
"text": "Abstract ",
"text_level": 2,
"page_idx": 0
},
{
"type": "text",
"text": "The hydrologic effect of replacing pasture or other short crops with trees is reasonably well understood on a mean annual basis. The impact on flow regime, as described by the annual flow duration curve (FDC) is less certain. A method to assess the impact of plantation establishment on FDCs was developed. The starting point for the analyses was the assumption that rainfall and vegetation age are the principal drivers of evapotranspiration. A key objective was to remove the variability in the rainfall signal, leaving changes in streamflow solely attributable to the evapotranspiration of the plantation. A method was developed to (1) fit a model to the observed annual time series of FDC percentiles; i.e. 10th percentile for each year of record with annual rainfall and plantation age as parameters, (2) replace the annual rainfall variation with the long term mean to obtain climate adjusted FDCs, and (3) quantify changes in FDC percentiles as plantations age. Data from 10 catchments from Australia, South Africa and New Zealand were used. The model was able to represent flow variation for the majority of percentiles at eight of the 10 catchments, particularly for the 10–50th percentiles. The adjusted FDCs revealed variable patterns in flow reductions with two types of responses (groups) being identified. Group 1 catchments show a substantial increase in the number of zero flow days, with low flows being more affected than high flows. Group 2 catchments show a more uniform reduction in flows across all percentiles. The differences may be partly explained by storage characteristics. The modelled flow reductions were in accord with published results of paired catchment experiments. An additional analysis was performed to characterise the impact of afforestation on the number of zero flow days $( N _ { \\mathrm { z e r o } } )$ for the catchments in group 1. This model performed particularly well, and when adjusted for climate, indicated a significant increase in $N _ { \\mathrm { z e r o } }$ . The zero flow day method could be used to determine change in the occurrence of any given flow in response to afforestation. The methods used in this study proved satisfactory in removing the rainfall variability, and have added useful insight into the hydrologic impacts of plantation establishment. This approach provides a methodology for understanding catchment response to afforestation, where paired catchment data is not available. ",
"page_idx": 0
},
{
"type": "text",
"text": "1. Introduction ",
"text_level": 2,
"page_idx": 1
},
{
"type": "image",
"img_path": "images/a8ecda1c69b27e4f79fce1589175a9d721cbdc1cf78b4cc06a015f3746f6b9d8.jpg",
"img_caption": [
"Fig. 1. Annual flow duration curves of daily flows from Pine Creek, Australia, 1989–2000. "
],
"img_footnote": [],
"page_idx": 1
},
{
"type": "equation",
"img_path": "images/181ea56ef185060d04bf4e274685f3e072e922e7b839f093d482c29bf89b71e8.jpg",
"text": "$$\nQ _ { \\% } = f ( P ) + g ( T )\n$$",
"text_format": "latex",
"page_idx": 2
},
{
"type": "table",
"img_path": "images/e3cb413394a475e555807ffdad913435940ec637873d673ee1b039e3bc3496d0.jpg",
"table_caption": [
"Table 2 Significance of the rainfall and time terms "
],
"table_footnote": [
"indicates that the rainfall term was significant at the $5 \\%$ level, $T$ indicates that the time term was significant at the $5 \\%$ level, \\* represents significance at the $10 \\%$ level, and na denotes too few data points for meaningful analysis. "
],
"table_body": "<html><body><table><tr><td rowspan=\"2\">Site</td><td colspan=\"10\">Percentile</td></tr><tr><td>10</td><td>20</td><td>30</td><td>40</td><td>50</td><td>60</td><td>70</td><td>80</td><td>90</td><td>100</td></tr><tr><td>Traralgon Ck</td><td>P</td><td>P,*</td><td>P</td><td>P</td><td>P,</td><td>P,</td><td>P,</td><td>P,</td><td>P</td><td>P</td></tr><tr><td>Redhill</td><td>P,T</td><td>P,T</td><td>,*</td><td>**</td><td>P.T</td><td>P,*</td><td>P*</td><td>P*</td><td>*</td><td>,*</td></tr><tr><td>Pine Ck</td><td></td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>T</td><td>T</td><td>na</td><td>na</td></tr><tr><td>Stewarts Ck 5</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P.T</td><td>P.T</td><td>P,T</td><td>na</td><td>na</td><td>na</td></tr><tr><td>Glendhu 2</td><td>P</td><td>P,T</td><td>P,*</td><td>P,T</td><td>P.T</td><td>P,ns</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td></tr><tr><td>Cathedral Peak 2</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>*,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td></tr><tr><td>Cathedral Peak 3</td><td>P.T</td><td>P.T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td></tr><tr><td>Lambrechtsbos A</td><td>P,T</td><td>P</td><td>P</td><td>P,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>*,T</td><td>T</td></tr><tr><td>Lambrechtsbos B</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>P,T</td><td>T</td><td>T</td></tr><tr><td>Biesievlei</td><td>P,T</td><td>P.T</td><td>P,T</td><td>P,T</td><td>*,T</td><td>*,T</td><td>T</td><td>T</td><td>P,T</td><td>P,T</td></tr></table></body></html>",
"page_idx": 5
}
]
```
\ No newline at end of file
@@ -9,7 +9,7 @@ from ...utils.config_reader import get_formula_enable, get_table_enable
from ...utils.model_utils import crop_img, get_res_list_from_layout_res
from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list, OcrConfidence

YOLO_LAYOUT_BASE_BATCH_SIZE = 8
MFD_BASE_BATCH_SIZE = 1
MFR_BASE_BATCH_SIZE = 16
@@ -318,6 +318,13 @@ class BatchAnalyze:
                layout_res_item['score'] = float(f"{ocr_score:.3f}")
                if ocr_score < OcrConfidence.min_confidence:
                    layout_res_item['category_id'] = 16
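                # In addition to the low-confidence check above, the branch below
                # drops tall, narrow spans whose text matches known garbled OCR
                # outputs by assigning them the same category_id (16).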
                else:
                    layout_res_bbox = [layout_res_item['poly'][0], layout_res_item['poly'][1],
                                       layout_res_item['poly'][4], layout_res_item['poly'][5]]
                    layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
                    layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
                    if ocr_text in ['(204号', '(20', '(2', '(2号', '(20号'] and ocr_score < 0.8 and layout_res_width < layout_res_height:
                        layout_res_item['category_id'] = 16
        total_processed += len(img_crop_list)
......
# Copyright (c) Opendatalab. All rights reserved.
import os
import time
from loguru import logger
@@ -151,9 +152,6 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
    """Fix the blocks"""
    fix_blocks = fix_block_spans(block_with_spans)
    """Sort the blocks"""
    sorted_blocks = sort_blocks_by_bbox(fix_blocks, page_w, page_h, footnote_blocks)
@@ -235,7 +233,8 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
    """Clean up memory"""
    pdf_doc.close()
    if os.getenv('MINERU_DONOT_CLEAN_MEM') is None and len(model_list) >= 10:
        clean_memory(get_device())
    return middle_json
......
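With the new guard, memory is only scrubbed for jobs of 10 or more pages, and the scrub can be switched off entirely. A usage sketch; the variable just has to be set in the process before the parse runs, since the code only checks whether it is set at all:

```python
# Opt out of the post-parse clean_memory() call introduced above.
# Any value works: the guard is os.getenv('MINERU_DONOT_CLEAN_MEM') is None.
import os

os.environ["MINERU_DONOT_CLEAN_MEM"] = "1"
# ...then run the mineru pipeline as usual in this process.
```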
@@ -365,8 +365,12 @@ def para_split(page_info_list):
    for page_info in page_info_list:
        page_info['para_blocks'] = []
        for block in all_blocks:
            if 'page_num' in block:
                if block['page_num'] == page_info['page_idx']:
                    page_info['para_blocks'].append(block)
                    # Remove the no-longer-needed page_num and page_size fields from the block
                    del block['page_num']
                    del block['page_size']

if __name__ == '__main__':
......
@@ -75,9 +75,9 @@ def doc_analyze(
):
    """
    Increasing MIN_BATCH_INFERENCE_SIZE moderately can improve performance, though it may increase VRAM usage.
    It can be set via the environment variable MINERU_MIN_BATCH_INFERENCE_SIZE; the default is 128.
    """
    min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 128))
    # Collect information for all pages
    all_pages_info = []  # stores (dataset_index, page_index, img, ocr, lang, width, height)
......
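Correspondingly, the batch floor can be tuned without touching code. A sketch:

```python
# Raise the minimum batch size for pipeline inference (default now 128).
# Larger values can improve throughput at the cost of VRAM, as the
# docstring above notes; set it before doc_analyze() is called.
import os

os.environ["MINERU_MIN_BATCH_INFERENCE_SIZE"] = "256"
```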
from mineru.utils.boxbase import bbox_relative_pos, calculate_iou, bbox_distance, is_in, get_minbox_if_overlap_by_ratio
from mineru.utils.enum_class import CategoryId, ContentType
@@ -13,7 +13,62 @@ class MagicModel:
        self.__fix_by_remove_low_confidence()
        """Among high-IoU (>0.9) detections, remove the one with lower confidence"""
        self.__fix_by_remove_high_iou_and_low_confidence()
        """Reclassify some table_footnote blocks as image_footnote"""
        self.__fix_footnote()
        """Handle overlapping image_body and table_body blocks"""
        self.__fix_by_remove_overlap_image_table_body()
    def __fix_by_remove_overlap_image_table_body(self):
        need_remove_list = []
        layout_dets = self.__page_model_info['layout_dets']
        image_blocks = list(filter(
            lambda x: x['category_id'] == CategoryId.ImageBody, layout_dets
        ))
        table_blocks = list(filter(
            lambda x: x['category_id'] == CategoryId.TableBody, layout_dets
        ))

        def add_need_remove_block(blocks):
            for i in range(len(blocks)):
                for j in range(i + 1, len(blocks)):
                    block1 = blocks[i]
                    block2 = blocks[j]
                    overlap_box = get_minbox_if_overlap_by_ratio(
                        block1['bbox'], block2['bbox'], 0.8
                    )
                    if overlap_box is not None:
                        # Determine which block has the smaller area and remove the smaller one
                        area1 = (block1['bbox'][2] - block1['bbox'][0]) * (block1['bbox'][3] - block1['bbox'][1])
                        area2 = (block2['bbox'][2] - block2['bbox'][0]) * (block2['bbox'][3] - block2['bbox'][1])
                        if area1 <= area2:
                            block_to_remove = block1
                            large_block = block2
                        else:
                            block_to_remove = block2
                            large_block = block1
                        if block_to_remove not in need_remove_list:
                            # Expand the larger block's bounding box to cover the removed one
                            x1, y1, x2, y2 = large_block['bbox']
                            sx1, sy1, sx2, sy2 = block_to_remove['bbox']
                            x1 = min(x1, sx1)
                            y1 = min(y1, sy1)
                            x2 = max(x2, sx2)
                            y2 = max(y2, sy2)
                            large_block['bbox'] = [x1, y1, x2, y2]
                            need_remove_list.append(block_to_remove)

        # Handle image-image overlaps
        add_need_remove_block(image_blocks)
        # Handle table-table overlaps
        add_need_remove_block(table_blocks)

        # Remove the marked blocks from the layout
        for need_remove in need_remove_list:
            if need_remove in layout_dets:
                layout_dets.remove(need_remove)
    def __fix_axis(self):
        need_remove_list = []
@@ -46,42 +101,46 @@ class MagicModel:
    def __fix_by_remove_high_iou_and_low_confidence(self):
        need_remove_list = []
        layout_dets = list(filter(
            lambda x: x['category_id'] in [
                CategoryId.Title,
                CategoryId.Text,
                CategoryId.ImageBody,
                CategoryId.ImageCaption,
                CategoryId.TableBody,
                CategoryId.TableCaption,
                CategoryId.TableFootnote,
                CategoryId.InterlineEquation_Layout,
                CategoryId.InterlineEquationNumber_Layout,
            ], self.__page_model_info['layout_dets']
        ))
        for i in range(len(layout_dets)):
            for j in range(i + 1, len(layout_dets)):
                layout_det1 = layout_dets[i]
                layout_det2 = layout_dets[j]
                if calculate_iou(layout_det1['bbox'], layout_det2['bbox']) > 0.9:
                    layout_det_need_remove = layout_det1 if layout_det1['score'] < layout_det2['score'] else layout_det2
                    if layout_det_need_remove not in need_remove_list:
                        need_remove_list.append(layout_det_need_remove)
        for need_remove in need_remove_list:
            self.__page_model_info['layout_dets'].remove(need_remove)
    def __fix_footnote(self):
        footnotes = []
        figures = []
        tables = []
        for obj in self.__page_model_info['layout_dets']:
            if obj['category_id'] == CategoryId.TableFootnote:
                footnotes.append(obj)
            elif obj['category_id'] == CategoryId.ImageBody:
                figures.append(obj)
            elif obj['category_id'] == CategoryId.TableBody:
                tables.append(obj)
        if len(footnotes) * len(figures) == 0:
            continue
@@ -314,10 +373,10 @@ class MagicModel:
    def get_imgs(self):
        with_captions = self.__tie_up_category_by_distance_v3(
            CategoryId.ImageBody, CategoryId.ImageCaption
        )
        with_footnotes = self.__tie_up_category_by_distance_v3(
            CategoryId.ImageBody, CategoryId.ImageFootnote
        )
        ret = []
        for v in with_captions:
@@ -333,10 +392,10 @@ class MagicModel:
    def get_tables(self) -> list:
        with_captions = self.__tie_up_category_by_distance_v3(
            CategoryId.TableBody, CategoryId.TableCaption
        )
        with_footnotes = self.__tie_up_category_by_distance_v3(
            CategoryId.TableBody, CategoryId.TableFootnote
        )
        ret = []
        for v in with_captions:
@@ -385,20 +444,21 @@ class MagicModel:
        all_spans = []
        layout_dets = self.__page_model_info['layout_dets']
        allow_category_id_list = [
            CategoryId.ImageBody,
            CategoryId.TableBody,
            CategoryId.InlineEquation,
            CategoryId.InterlineEquation_YOLO,
            CategoryId.OcrText,
        ]
        """These are stitched together as spans"""
        for layout_det in layout_dets:
            category_id = layout_det['category_id']
            if category_id in allow_category_id_list:
                span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
                if category_id == CategoryId.ImageBody:
                    span['type'] = ContentType.IMAGE
                elif category_id == CategoryId.TableBody:
                    # Get the table model result
                    latex = layout_det.get('latex', None)
                    html = layout_det.get('html', None)
@@ -407,13 +467,13 @@ class MagicModel:
                    elif html:
                        span['html'] = html
                    span['type'] = ContentType.TABLE
                elif category_id == CategoryId.InlineEquation:
                    span['content'] = layout_det['latex']
                    span['type'] = ContentType.INLINE_EQUATION
                elif category_id == CategoryId.InterlineEquation_YOLO:
                    span['content'] = layout_det['latex']
                    span['type'] = ContentType.INTERLINE_EQUATION
                elif category_id == CategoryId.OcrText:
                    span['content'] = layout_det['text']
                    span['type'] = ContentType.TEXT
                all_spans.append(span)
@@ -438,4 +498,4 @@ class MagicModel:
            for col in extra_col:
                block[col] = item.get(col, None)
            blocks.append(block)
        return blocks
\ No newline at end of file
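Both dedup passes above lean on two geometry helpers imported from `mineru.utils.boxbase`. Their actual implementations are not part of this diff; the following are minimal sketches under the semantics the call sites imply (IoU over `[x0, y0, x1, y1]` boxes, and "return the smaller box when the overlap covers at least `ratio` of its area"):

```python
# Sketches of the boxbase helpers used above, under assumed semantics.
# Boxes are [x0, y0, x1, y1] with x0 <= x1 and y0 <= y1.

def calculate_iou(b1, b2):
    """Standard intersection-over-union of two axis-aligned boxes."""
    ix0, iy0 = max(b1[0], b2[0]), max(b1[1], b2[1])
    ix1, iy1 = min(b1[2], b2[2]), min(b1[3], b2[3])
    inter = max(0, ix1 - ix0) * max(0, iy1 - iy0)
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union = area1 + area2 - inter
    return inter / union if union > 0 else 0.0


def get_minbox_if_overlap_by_ratio(b1, b2, ratio):
    """Return the smaller box if the intersection covers >= ratio of its area."""
    ix0, iy0 = max(b1[0], b2[0]), max(b1[1], b2[1])
    ix1, iy1 = min(b1[2], b2[2]), min(b1[3], b2[3])
    inter = max(0, ix1 - ix0) * max(0, iy1 - iy0)
    small = b1 if (b1[2] - b1[0]) * (b1[3] - b1[1]) <= (b2[2] - b2[0]) * (b2[3] - b2[1]) else b2
    small_area = (small[2] - small[0]) * (small[3] - small[1])
    if small_area > 0 and inter / small_area >= ratio:
        return small
    return None
```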
@@ -157,9 +157,11 @@ def merge_para_with_text(para_block):
            if span_type == ContentType.TEXT:
                content = escape_special_markdown_char(span['content'])
            elif span_type == ContentType.INLINE_EQUATION:
                if span.get('content', ''):
                    content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
            elif span_type == ContentType.INTERLINE_EQUATION:
                if span.get('content', ''):
                    content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
            content = content.strip()
@@ -191,12 +193,12 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
    para_content = {}
    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.TITLE:
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
        title_level = get_title_level(para_block)
@@ -206,14 +208,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
        if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
            return None
        para_content = {
            'type': ContentType.EQUATION,
            'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
        }
        if para_block['lines'][0]['spans'][0].get('content', ''):
            para_content['text'] = merge_para_with_text(para_block)
            para_content['text_format'] = 'latex'
    elif para_type == BlockType.IMAGE:
        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_BODY:
                for line in block['lines']:
@@ -222,29 +224,26 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
                        if span.get('image_path', ''):
                            para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.IMAGE_CAPTION:
                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.TABLE:
        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.TABLE:
                            if span.get('html', ''):
                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.TABLE_CAPTION:
                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))

    para_content['page_idx'] = page_idx
......
@@ -77,7 +77,7 @@ def get_predictor(
        raise ImportError(
            "sglang is not installed, so sglang-engine backend cannot be used. "
            "If you need to use sglang-engine backend for inference, "
            "please install sglang[all]==0.4.8 or a newer version."
        )
    predictor = SglangEnginePredictor(
        server_args=ServerArgs(model_path, **kwargs),
......