Unverified Commit 6ab12348 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2625 from opendatalab/release-2.0.0

Release 2.0.0
parents 9487d33d 4fbec469
import re
import itertools
import html
from typing import Any, Dict, List
from pydantic import (
BaseModel,
computed_field,
model_validator,
)
class TableCell(BaseModel):
"""TableCell."""
row_span: int = 1
col_span: int = 1
start_row_offset_idx: int
end_row_offset_idx: int
start_col_offset_idx: int
end_col_offset_idx: int
text: str
column_header: bool = False
row_header: bool = False
row_section: bool = False
@model_validator(mode="before")
@classmethod
def from_dict_format(cls, data: Any) -> Any:
"""from_dict_format."""
if isinstance(data, Dict):
# Check if this is a native BoundingBox or a bbox from docling-ibm-models
if (
# "bbox" not in data
# or data["bbox"] is None
# or isinstance(data["bbox"], BoundingBox)
"text"
in data
):
return data
text = data["bbox"].get("token", "")
if not len(text):
text_cells = data.pop("text_cell_bboxes", None)
if text_cells:
for el in text_cells:
text += el["token"] + " "
text = text.strip()
data["text"] = text
return data
class TableData(BaseModel): # TBD
"""BaseTableData."""
table_cells: List[TableCell] = []
num_rows: int = 0
num_cols: int = 0
@computed_field # type: ignore
@property
def grid(
self,
) -> List[List[TableCell]]:
"""grid."""
# Initialise empty table data grid (only empty cells)
table_data = [
[
TableCell(
text="",
start_row_offset_idx=i,
end_row_offset_idx=i + 1,
start_col_offset_idx=j,
end_col_offset_idx=j + 1,
)
for j in range(self.num_cols)
]
for i in range(self.num_rows)
]
# Overwrite cells in table data for which there is actual cell content.
for cell in self.table_cells:
for i in range(
min(cell.start_row_offset_idx, self.num_rows),
min(cell.end_row_offset_idx, self.num_rows),
):
for j in range(
min(cell.start_col_offset_idx, self.num_cols),
min(cell.end_col_offset_idx, self.num_cols),
):
table_data[i][j] = cell
return table_data
"""
OTSL
"""
OTSL_NL = "<nl>"
OTSL_FCEL = "<fcel>"
OTSL_ECEL = "<ecel>"
OTSL_LCEL = "<lcel>"
OTSL_UCEL = "<ucel>"
OTSL_XCEL = "<xcel>"
def otsl_extract_tokens_and_text(s: str):
# Pattern to match anything enclosed by < >
# (including the angle brackets themselves)
# pattern = r"(<[^>]+>)"
pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")"
# Find all OTSL structure tokens (e.g. "<fcel>", "<nl>", etc.)
tokens = re.findall(pattern, s)
# Keep the matched tokens as-is (no filtering is applied here)
tokens = [token for token in tokens]
# Split the string by those tokens to get the in-between text
text_parts = re.split(pattern, s)
text_parts = [token for token in text_parts]
# Remove any empty or purely whitespace strings from text_parts
text_parts = [part for part in text_parts if part.strip()]
return tokens, text_parts
def otsl_parse_texts(texts, tokens):
split_word = OTSL_NL
split_row_tokens = [
list(y)
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
if not x
]
table_cells = []
r_idx = 0
c_idx = 0
def count_right(tokens, c_idx, r_idx, which_tokens):
span = 0
c_idx_iter = c_idx
while tokens[r_idx][c_idx_iter] in which_tokens:
c_idx_iter += 1
span += 1
if c_idx_iter >= len(tokens[r_idx]):
return span
return span
def count_down(tokens, c_idx, r_idx, which_tokens):
span = 0
r_idx_iter = r_idx
while tokens[r_idx_iter][c_idx] in which_tokens:
r_idx_iter += 1
span += 1
if r_idx_iter >= len(tokens):
return span
return span
for i, text in enumerate(texts):
cell_text = ""
if text in [
OTSL_FCEL,
OTSL_ECEL,
]:
row_span = 1
col_span = 1
right_offset = 1
if text != OTSL_ECEL:
cell_text = texts[i + 1]
right_offset = 2
# Check next element(s) for lcel / ucel / xcel,
# set properly row_span, col_span
next_right_cell = ""
if i + right_offset < len(texts):
next_right_cell = texts[i + right_offset]
next_bottom_cell = ""
if r_idx + 1 < len(split_row_tokens):
if c_idx < len(split_row_tokens[r_idx + 1]):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
if next_right_cell in [
OTSL_LCEL,
OTSL_XCEL,
]:
# we have a horizontal spanning cell or a 2D spanning cell
col_span += count_right(
split_row_tokens,
c_idx + 1,
r_idx,
[OTSL_LCEL, OTSL_XCEL],
)
if next_bottom_cell in [
OTSL_UCEL,
OTSL_XCEL,
]:
# we have a vertical spanning cell or 2d spanning cell
row_span += count_down(
split_row_tokens,
c_idx,
r_idx + 1,
[OTSL_UCEL, OTSL_XCEL],
)
table_cells.append(
TableCell(
text=cell_text.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=r_idx,
end_row_offset_idx=r_idx + row_span,
start_col_offset_idx=c_idx,
end_col_offset_idx=c_idx + col_span,
)
)
if text in [
OTSL_FCEL,
OTSL_ECEL,
OTSL_LCEL,
OTSL_UCEL,
OTSL_XCEL,
]:
c_idx += 1
if text == OTSL_NL:
r_idx += 1
c_idx = 0
return table_cells, split_row_tokens
def export_to_html(table_data: TableData):
nrows = table_data.num_rows
ncols = table_data.num_cols
text = ""
if len(table_data.table_cells) == 0:
return ""
body = ""
for i in range(nrows):
body += "<tr>"
for j in range(ncols):
cell: TableCell = table_data.grid[i][j]
rowspan, rowstart = (
cell.row_span,
cell.start_row_offset_idx,
)
colspan, colstart = (
cell.col_span,
cell.start_col_offset_idx,
)
if rowstart != i:
continue
if colstart != j:
continue
content = html.escape(cell.text.strip())
celltag = "td"
if cell.column_header:
celltag = "th"
opening_tag = f"{celltag}"
if rowspan > 1:
opening_tag += f' rowspan="{rowspan}"'
if colspan > 1:
opening_tag += f' colspan="{colspan}"'
body += f"<{opening_tag}>{content}</{celltag}>"
body += "</tr>"
# dir = get_text_direction(text)
body = f"<table>{body}</table>"
return body
def convert_otsl_to_html(otsl_content: str):
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
table_data = TableData(
num_rows=len(split_row_tokens),
num_cols=(
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
),
table_cells=table_cells,
)
return export_to_html(table_data)
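# A minimal usage sketch (not part of this diff): the OTSL sequence below is an
# assumed example. "<lcel>" merges a cell into the cell on its left, so the first
# header cell is emitted with colspan="2".
if __name__ == "__main__":
    demo_otsl = "<fcel>Header A<lcel><fcel>Header B<nl><fcel>1<fcel>2<fcel>3<nl>"
    print(convert_otsl_to_html(demo_otsl))
    # <table><tr><td colspan="2">Header A</td><td>Header B</td></tr>
    #        <tr><td>1</td><td>2</td><td>3</td></tr></table>   (printed on one line)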
# Copyright (c) Opendatalab. All rights reserved.
import hashlib
import json
def compute_md5(file_bytes):
def bytes_md5(file_bytes):
hasher = hashlib.md5()
hasher.update(file_bytes)
return hasher.hexdigest().upper()
def compute_sha256(input_string):
def str_md5(input_string):
hasher = hashlib.md5()
# In Python 3, the string must be encoded to bytes before it can be hashed
input_bytes = input_string.encode('utf-8')
hasher.update(input_bytes)
return hasher.hexdigest()
def str_sha256(input_string):
hasher = hashlib.sha256()
# In Python 3, the string must be encoded to bytes before it can be hashed
input_bytes = input_string.encode('utf-8')
hasher.update(input_bytes)
return hasher.hexdigest()
def dict_md5(d):
json_str = json.dumps(d, sort_keys=True, ensure_ascii=False)
return hashlib.md5(json_str.encode('utf-8')).hexdigest()
\ No newline at end of file
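# A minimal usage sketch (not part of this diff), assuming the renamed helpers
# bytes_md5 / str_sha256 / dict_md5 are the ones kept by this release:
if __name__ == "__main__":
    print(bytes_md5(b"hello"))         # uppercase MD5 hex digest of raw bytes
    print(str_sha256("hello"))         # lowercase SHA-256 hex digest of a string
    print(dict_md5({"b": 1, "a": 2}))  # stable digest: keys are sorted before hashing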
# Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
import ast
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import merge_para_with_text
# @todo: some formulas end with "\", which causes the "$" appended at the end to be escaped; this also needs to be fixed
formula_optimize_prompt = """Please fix the errors in the following LaTeX formula according to the guidelines below, making sure the formula renders and matches the original content:
1. Fix rendering or compilation errors:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
- Errors that prevent compilation or rendering, such as keywords not supported by KaTeX
2. Preserve the original information:
- Keep all important information from the original formula
- Do not add any new information that is not in the original formula
IMPORTANT: Return only the corrected formula, without any introduction, explanation, or metadata.
LaTeX recognition result:
$FORMULA
Your corrected result:
"""
text_optimize_prompt = f"""Please fix OCR-induced errors according to the guidelines below, making sure the text is coherent and matches the original content:
1. Fix OCR-induced typos and errors:
- Fix common OCR errors (e.g. 'rn' misread as 'm')
- Use context and common sense to make corrections
- Only fix obvious errors; do not modify the content unnecessarily
- Do not add extra periods or other unnecessary punctuation
2. Keep the original structure:
- Keep all headings and subheadings
3. Preserve the original content:
- Keep all important information from the original text
- Do not add any new information that is not in the original text
- Keep the line breaks between paragraphs
4. Maintain coherence:
- Make sure the content connects smoothly with the preceding text
- Handle text that starts or ends in the middle of a sentence appropriately
5. Fix inline formulas:
- Remove extra spaces before and after inline formulas
- Fix OCR errors inside formulas
- Make sure formulas can be rendered by KaTeX
6. Fix full-width characters
- Convert full-width punctuation to half-width punctuation
- Convert full-width letters to half-width letters
- Convert full-width digits to half-width digits
IMPORTANT: Return only the corrected text, preserving all original formatting, including line breaks. Do not include any introduction, explanation, or metadata.
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
pass
def llm_aided_text(pdf_info_dict, text_aided_config):
pass
def llm_aided_title(pdf_info_dict, title_aided_config):
def llm_aided_title(page_info_list, title_aided_config):
client = OpenAI(
api_key=title_aided_config["api_key"],
base_url=title_aided_config["base_url"],
......@@ -78,8 +14,8 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
title_dict = {}
origin_title_list = []
i = 0
for page_num, page in pdf_info_dict.items():
blocks = page["para_blocks"]
for page_info in page_info_list:
blocks = page_info["para_blocks"]
for block in blocks:
if block["type"] == "title":
origin_title_list.append(block)
......@@ -92,7 +28,7 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
line_avg_height = sum(page_line_height_list) / len(page_line_height_list)
else:
line_avg_height = int(block['bbox'][3] - block['bbox'][1])
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1]
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_info['page_idx']) + 1]
i += 1
# logger.info(f"Title list: {title_dict}")
......@@ -115,16 +51,21 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
- Title levels must be continuous from first to last; do not skip levels
- Use at most 4 title levels; do not add too many levels
- For each optimized title keep only the integer representing its level; do not keep any other information
5. Sanity check and fine-tuning:
- After the initial leveling, carefully check that the result is reasonable
- Fine-tune unreasonable levels based on context and logical order
- Make sure the final leveling matches the document's actual structure and logic
- The dictionary may contain body text that was mistaken for a title; you can exclude such entries by marking their level as 0
IMPORTANT:
Return the optimized dictionary of title levels directly, in the format {{title_id: title_level}}, as follows:
{{0:1,1:2,2:2,3:3}}
{{
0:1,
1:2,
2:2,
3:3
}}
Do not reformat the dictionary and do not return any other information.
Input title list:
......@@ -145,16 +86,23 @@ Corrected title list:
{'role': 'user', 'content': title_optimize_prompt}],
temperature=0.7,
)
# logger.info(f"Title completion: {completion.choices[0].message.content}")
dict_completion = ast.literal_eval(completion.choices[0].message.content)
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
content = completion.choices[0].message.content.strip()
# logger.info(f"Title completion: {content}")
if "</think>" in content:
idx = content.index("</think>") + len("</think>")
content = content[idx:].strip()
import json_repair
dict_completion = json_repair.loads(content)
dict_completion = {int(k): int(v) for k, v in dict_completion.items()}
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
if len(dict_completion) == len(title_dict):
for i, origin_title_block in enumerate(origin_title_list):
origin_title_block["level"] = int(dict_completion[i])
break
else:
logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
logger.warning(
"The number of titles in the optimized result is not equal to the number of titles in the input.")
retry_count += 1
except Exception as e:
logger.exception(e)
......
import time
import torch
import gc
from PIL import Image
from loguru import logger
import numpy as np
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.clean_memory import clean_memory
from mineru.utils.boxbase import get_minbox_if_overlap_by_ratio
def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
def crop_img(input_res, input_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
......@@ -16,15 +17,24 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
if isinstance(input_img, np.ndarray):
# Crop the original image using numpy slicing
cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
# Crop the original image using numpy slicing
cropped_img = input_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
else:
# Create a white background array
return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
# Crop image
crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
cropped_img = input_img.crop(crop_box)
return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
crop_new_height]
......@@ -287,6 +297,20 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
def clean_memory(device='cuda'):
if device == 'cuda':
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
elif str(device).startswith("npu"):
import torch_npu
if torch_npu.npu.is_available():
torch_npu.npu.empty_cache()
elif str(device).startswith("mps"):
torch.mps.empty_cache()
gc.collect()
def clean_vram(device, vram_threshold=8):
total_memory = get_vram(device)
if total_memory and total_memory <= vram_threshold:
......
import os
from huggingface_hub import snapshot_download as hf_snapshot_download
from modelscope import snapshot_download as ms_snapshot_download
from mineru.utils.config_reader import get_local_models_dir
from mineru.utils.enum_class import ModelPath
def auto_download_and_get_model_root_path(relative_path: str, repo_mode='pipeline') -> str:
"""
Reliable download of a file or a directory.
- If the input is a file: return the absolute local path of the file
- If the input is a directory: return the relative path string (mirroring relative_path) under the local cache
:param repo_mode: repository mode, 'pipeline' or 'vlm'
:param relative_path: relative path of the file or directory
:return: absolute local file path, or relative path
"""
model_source = os.getenv('MINERU_MODEL_SOURCE', "huggingface")
if model_source == 'local':
local_models_config = get_local_models_dir()
root_path = local_models_config.get(repo_mode, None)
if not root_path:
raise ValueError(f"Local path for repo_mode '{repo_mode}' is not configured.")
return root_path
# Map each repository mode to its model root paths
repo_mapping = {
'pipeline': {
'huggingface': ModelPath.pipeline_root_hf,
'modelscope': ModelPath.pipeline_root_modelscope,
'default': ModelPath.pipeline_root_hf
},
'vlm': {
'huggingface': ModelPath.vlm_root_hf,
'modelscope': ModelPath.vlm_root_modelscope,
'default': ModelPath.vlm_root_hf
}
}
if repo_mode not in repo_mapping:
raise ValueError(f"Unsupported repo_mode: {repo_mode}, must be 'pipeline' or 'vlm'")
# If model_source is not set, or its value is not 'modelscope', fall back to the default
repo = repo_mapping[repo_mode].get(model_source, repo_mapping[repo_mode]['default'])
if model_source == "huggingface":
snapshot_download = hf_snapshot_download
elif model_source == "modelscope":
snapshot_download = ms_snapshot_download
else:
raise ValueError(f"未知的仓库类型: {model_source}")
cache_dir = None
if repo_mode == 'pipeline':
relative_path = relative_path.strip('/')
cache_dir = snapshot_download(repo, allow_patterns=[relative_path, relative_path+"/*"])
elif repo_mode == 'vlm':
# In VLM mode, download the whole model directory directly
cache_dir = snapshot_download(repo)
if not cache_dir:
raise FileNotFoundError(f"Failed to download model: {relative_path} from {repo}")
return cache_dir
if __name__ == '__main__':
path1 = "models/README.md"
root = auto_download_and_get_model_root_path(path1)
print("本地文件绝对路径:", os.path.join(root, path1))
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import copy
import cv2
import numpy as np
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
class OcrConfidence:
min_confidence = 0.68
min_width = 3
def merge_spans_to_line(spans, threshold=0.6):
if len(spans) == 0:
return []
else:
# Sort by y0 coordinate
spans.sort(key=lambda span: span['bbox'][1])
lines = []
current_line = [spans[0]]
for span in spans[1:]:
# If the current span overlaps the last span of the current line on the y-axis, add it to the current line
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
current_line.append(span)
else:
# Otherwise, start a new line
lines.append(current_line)
current_line = [span]
# Append the last line
if current_line:
lines.append(current_line)
return lines
def __is_overlaps_y_exceeds_threshold(bbox1,
bbox2,
overlap_ratio_threshold=0.8):
"""检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过80%"""
_, y0_1, _, y1_1 = bbox1
_, y0_2, _, y1_2 = bbox2
overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
height1, height2 = y1_1 - y0_1, y1_2 - y0_2
# max_height = max(height1, height2)
min_height = min(height1, height2)
return (overlap / min_height) > overlap_ratio_threshold
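# A small worked example (not part of this diff) of the y-overlap test above, with
# assumed bboxes in (x0, y0, x1, y1) form: (0, 0, 10, 10) and (0, 2, 10, 12) overlap
# by 8 on the y-axis and the shorter box is 10 high, so 8 / 10 = 0.8 is not strictly
# greater than the default 0.8 threshold and the function returns False, while
# (0, 0, 10, 10) and (0, 1, 10, 11) give 9 / 10 = 0.9 > 0.8 and return True.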
def img_decode(content: bytes):
......@@ -212,10 +253,7 @@ def merge_det_boxes(dt_boxes):
angle_boxes_list.append(text_box)
continue
text_box_dict = {
'bbox': text_bbox,
'type': 'text',
}
text_box_dict = {'bbox': text_bbox}
dt_boxes_dict_list.append(text_box_dict)
# Merge adjacent text regions into lines
......@@ -271,7 +309,7 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
p1, p2, p3, p4 = box_ocr_res[0]
text, score = box_ocr_res[1]
# logger.info(f"text: {text}, score: {score}")
if score < 0.6:  # filter out low-confidence results
if score < OcrConfidence.min_confidence:  # filter out low-confidence results
continue
else:
p1, p2, p3, p4 = box_ocr_res
......@@ -284,6 +322,11 @@ def get_ocr_result_list(ocr_res, useful_list, ocr_enable, new_image, lang):
# average_angle_degrees = calculate_angle_degrees(box_ocr_res[0])
# if average_angle_degrees > 0.5:
poly = [p1, p2, p3, p4]
if (p3[0] - p1[0]) < OcrConfidence.min_width:
# logger.info(f"width too small: {p3[0] - p1[0]}, text: {text}")
continue
if calculate_is_angle(poly):
# logger.info(f"average_angle_degrees: {average_angle_degrees}, text: {text}")
# The angle with the x-axis exceeds 0.5 degrees, so correct the boundaries
......
# Copyright (c) Opendatalab. All rights reserved.
import re
from io import BytesIO
import numpy as np
import pypdfium2 as pdfium
from loguru import logger
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams, LTImage, LTFigure
from pdfminer.converter import PDFPageAggregator
def classify(pdf_bytes):
"""
Decide whether a PDF's text can be extracted directly or whether OCR is needed.
Args:
pdf_bytes: the PDF file as bytes
Returns:
str: 'txt' means text can be extracted directly, 'ocr' means OCR is needed
"""
try:
# Load the PDF from bytes
sample_pdf_bytes = extract_pages(pdf_bytes)
pdf = pdfium.PdfDocument(sample_pdf_bytes)
# Get the page count
page_count = len(pdf)
# If the PDF has no pages, fall back to OCR
if page_count == 0:
return 'ocr'
# Number of pages to check (at most 10)
pages_to_check = min(page_count, 10)
# Threshold: fewer than 50 valid characters per page on average means OCR is needed
chars_threshold = 50
if (get_avg_cleaned_chars_per_page(pdf, pages_to_check) < chars_threshold) or detect_invalid_chars(sample_pdf_bytes):
return 'ocr'
else:
if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
return 'ocr'
return 'txt'
except Exception as e:
logger.error(f"判断PDF类型时出错: {e}")
# 出错时默认使用OCR
return 'ocr'
def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
# Total character count
total_chars = 0
# Total character count after cleaning
cleaned_total_chars = 0
# Inspect the text of the first few pages
for i in range(pages_to_check):
page = pdf_doc[i]
text_page = page.get_textpage()
text = text_page.get_text_bounded()
total_chars += len(text)
# Clean the extracted text by removing whitespace characters
cleaned_text = re.sub(r'\s+', '', text)
cleaned_total_chars += len(cleaned_text)
# Average number of characters per page
avg_cleaned_chars_per_page = cleaned_total_chars / pages_to_check
# logger.debug(f"PDF分析: 平均每页清理后{avg_cleaned_chars_per_page:.1f}字符")
pdf_doc.close() # 关闭PDF文档
return avg_cleaned_chars_per_page
def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
# Create an in-memory file object
pdf_stream = BytesIO(sample_pdf_bytes)
# Create the PDF parser
parser = PDFParser(pdf_stream)
# Create the PDF document object
document = PDFDocument(parser)
# Check whether the document allows text extraction
if not document.is_extractable:
# logger.warning("The PDF does not allow content extraction")
return 1.0  # default to high coverage, since the content cannot be extracted
# Create the resource manager and layout parameters
rsrcmgr = PDFResourceManager()
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=None,
detect_vertical=False,
all_texts=False,
)
# Create the aggregator
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create the interpreter
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Count the pages with high image coverage
high_image_coverage_pages = 0
page_count = 0
# Iterate over the pages
for page in PDFPage.create_pages(document):
# Limit the number of pages checked
if page_count >= pages_to_check:
break
# Process the page
interpreter.process_page(page)
layout = device.get_result()
# Page dimensions
page_width = layout.width
page_height = layout.height
page_area = page_width * page_height
# Total area covered by images
image_area = 0
# Iterate over the page elements
for element in layout:
# Check whether the element is an image or a figure
if isinstance(element, (LTImage, LTFigure)):
# Area of the image bounding box
img_width = element.width
img_height = element.height
img_area = img_width * img_height
image_area += img_area
# Coverage ratio
coverage_ratio = min(image_area / page_area, 1.0) if page_area > 0 else 0
# logger.debug(f"PDF analysis: page {page_count + 1} image coverage: {coverage_ratio:.2f}")
# Check whether the coverage is high
if coverage_ratio >= 0.8:  # 80% is used as the high-coverage threshold
high_image_coverage_pages += 1
page_count += 1
# If no pages were processed, return 0
if page_count == 0:
return 0.0
# Fraction of pages with high image coverage
high_coverage_ratio = high_image_coverage_pages / page_count
# logger.debug(f"PDF analysis: fraction of pages with high image coverage: {high_coverage_ratio:.2f}")
# Release resources
pdf_stream.close()
return high_coverage_ratio
def extract_pages(src_pdf_bytes: bytes) -> bytes:
"""
Randomly extract up to 10 pages from the PDF bytes and return the new PDF as bytes.
Args:
src_pdf_bytes: the PDF file as bytes
Returns:
bytes: the PDF bytes containing only the extracted pages
"""
# Load the PDF from bytes
pdf = pdfium.PdfDocument(src_pdf_bytes)
# Get the page count
total_page = len(pdf)
if total_page == 0:
# If the PDF has no pages, return an empty document
logger.warning("PDF is empty, return empty document")
return b''
# Select at most 10 pages
select_page_cnt = min(10, total_page)
# Randomly choose pages out of the total
page_indices = np.random.choice(total_page, select_page_cnt, replace=False).tolist()
# Create a new PDF document
sample_docs = pdfium.PdfDocument.new()
try:
# Import the selected pages into the new document
sample_docs.import_pages(pdf, page_indices)
# Save the new PDF into an in-memory buffer
output_buffer = BytesIO()
sample_docs.save(output_buffer)
# Return the bytes
return output_buffer.getvalue()
except Exception as e:
logger.exception(e)
return b''  # return empty bytes on error
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
"""
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_pdf_bytes = extract_pages(src_pdf_bytes)
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=None,
detect_vertical=False,
all_texts=False,
)
text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
text = text.replace("\n", "")
# logger.info(text)
'''Garbled text extracted by pdfminer shows up as (cid:xxx) tokens'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
if text_len == 0:
cid_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.debug(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''If more than 5% of a document's text is garbled, the document is treated as garbled'''
if cid_chars_radio > 0.05:
return True # 乱码文档
else:
return False # 正常文档
if __name__ == '__main__':
with open('/Users/myhloli/pdf/luanma2x10.pdf', 'rb') as f:
p_bytes = f.read()
logger.info(f"PDF分类结果: {classify(p_bytes)}")
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
from io import BytesIO
import pypdfium2 as pdfium
from loguru import logger
from PIL import Image
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
from .hash_utils import str_sha256
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200) -> dict:
"""Convert pdfium.PdfDocument to image, Then convert the image to base64.
Args:
page (_type_): pdfium.PdfPage
dpi (int, optional): reset the dpi of dpi. Defaults to 200.
Returns:
dict: {'img_base64': str, 'img_pil': pil_img, 'scale': float }
"""
pil_img, scale = page_to_image(page, dpi=dpi)
img_base64 = image_to_b64str(pil_img)
image_dict = {
"img_base64": img_base64,
"img_pil": pil_img,
"scale": scale,
}
return image_dict
def load_images_from_pdf(
pdf_bytes: bytes,
dpi=200,
start_page_id=0,
end_page_id=None,
):
images_list = []
pdf_doc = pdfium.PdfDocument(pdf_bytes)
pdf_page_num = len(pdf_doc)
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1
for index in range(0, pdf_page_num):
if start_page_id <= index <= end_page_id:
page = pdf_doc[index]
image_dict = pdf_page_to_image(page, dpi=dpi)
images_list.append(image_dict)
return images_list, pdf_doc
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
"""从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
图片存放在save_path下,文件名是:
{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
# 拼接文件名
filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
# 老版本返回不带bucket的路径
img_path = f"{return_path}_{filename}" if return_path is not None else None
# 新版本生成平铺路径
img_hash256_path = f"{str_sha256(img_path)}.jpg"
# img_hash256_path = f'{img_path}.jpg'
crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
img_bytes = image_to_bytes(crop_img, image_format="JPEG")
image_writer.write(img_hash256_path, img_bytes)
return img_hash256_path
def get_crop_img(bbox: tuple, pil_img, scale=2):
scale_bbox = (
int(bbox[0] * scale),
int(bbox[1] * scale),
int(bbox[2] * scale),
int(bbox[3] * scale),
)
return pil_img.crop(scale_bbox)
def images_bytes_to_pdf_bytes(image_bytes):
# In-memory buffer
pdf_buffer = BytesIO()
# Load the image and convert it to RGB mode
image = Image.open(BytesIO(image_bytes)).convert("RGB")
# Save the first image as a PDF and append the rest
image.save(pdf_buffer, format="PDF", save_all=True)
# Get the PDF bytes and reset the pointer (optional)
pdf_bytes = pdf_buffer.getvalue()
pdf_buffer.close()
return pdf_bytes
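# A minimal usage sketch (not part of this diff): render the first page of an assumed
# local "example.pdf" at 200 dpi and inspect the returned image dict.
if __name__ == "__main__":
    with open("example.pdf", "rb") as f:
        images, doc = load_images_from_pdf(f.read(), dpi=200, start_page_id=0, end_page_id=0)
    print(len(images), images[0]["scale"], images[0]["img_pil"].size)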
# Copyright (c) Opendatalab. All rights reserved.
import base64
from io import BytesIO
from loguru import logger
from PIL import Image
from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
def page_to_image(
page: PdfPage,
dpi: int = 144, # changed from 200 to 144
max_width_or_height: int = 2560, # changed from 4500 to 2560
) -> (Image.Image, float):
scale = dpi / 72
long_side_length = max(*page.get_size())
if long_side_length > max_width_or_height:
scale = max_width_or_height / long_side_length
bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
try:
image = bitmap.to_pil()
finally:
try:
bitmap.close()
except Exception:
pass
return image, scale
def image_to_bytes(
image: Image.Image,
image_format: str = "PNG", # 也可以用 "JPEG"
) -> bytes:
with BytesIO() as image_buffer:
image.save(image_buffer, format=image_format)
return image_buffer.getvalue()
def image_to_b64str(
image: Image.Image,
image_format: str = "PNG", # 也可以用 "JPEG"
) -> str:
image_bytes = image_to_bytes(image, image_format)
return base64.b64encode(image_bytes).decode("utf-8")
def pdf_to_images(
pdf: str | bytes | PdfDocument,
dpi: int = 144,
max_width_or_height: int = 2560,
start_page_id: int = 0,
end_page_id: int | None = None,
) -> list[Image.Image]:
doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
page_num = len(doc)
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
if end_page_id > page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = page_num - 1
images = []
try:
for i in range(start_page_id, end_page_id + 1):
image, _ = page_to_image(doc[i], dpi, max_width_or_height)
images.append(image)
finally:
try:
doc.close()
except Exception:
pass
return images
def pdf_to_images_bytes(
pdf: str | bytes | PdfDocument,
dpi: int = 144,
max_width_or_height: int = 2560,
start_page_id: int = 0,
end_page_id: int | None = None,
image_format: str = "PNG",
) -> list[bytes]:
images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
return [image_to_bytes(image, image_format) for image in images]
def pdf_to_images_b64strs(
pdf: str | bytes | PdfDocument,
dpi: int = 144,
max_width_or_height: int = 2560,
start_page_id: int = 0,
end_page_id: int | None = None,
image_format: str = "PNG",
) -> list[str]:
images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
return [image_to_b64str(image, image_format) for image in images]
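# A minimal usage sketch (not part of this diff): convert the first two pages of an
# assumed local "example.pdf" into PIL images and base64 strings.
if __name__ == "__main__":
    pages = pdf_to_images("example.pdf", dpi=144, end_page_id=1)
    print([page.size for page in pages])
    b64_pages = pdf_to_images_b64strs("example.pdf", end_page_id=1)
    print(len(b64_pages), len(b64_pages[0]))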
from typing import List
import math
import pypdfium2 as pdfium
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
def get_page(
page: pdfium.PdfPage,
quote_loosebox: bool =True,
superscript_height_threshold: float = 0.7,
line_distance_threshold: float = 0.1,
) -> dict:
textpage = page.get_textpage()
page_bbox: List[float] = page.get_bbox()
page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
page_rotation = 0
try:
page_rotation = page.get_rotation()
except:
pass
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
lines = get_lines(spans)
assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
blocks = get_blocks(lines)
page = {
"bbox": page_bbox,
"width": page_width,
"height": page_height,
"rotation": page_rotation,
"blocks": blocks
}
return page
\ No newline at end of file
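# A minimal usage sketch (not part of this diff): extract the block/line/span structure
# of the first page of an assumed local "example.pdf".
if __name__ == "__main__":
    doc = pdfium.PdfDocument("example.pdf")
    first_page = get_page(doc[0])
    print(first_page["width"], first_page["height"], len(first_page["blocks"]))
    doc.close()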
import asyncio
import threading
from queue import Queue
from typing import Any, AsyncIterable, Coroutine, Iterable, TypeVar
T = TypeVar("T")
def run_async(coroutine: Coroutine[Any, Any, T]) -> T:
if not asyncio.iscoroutine(coroutine):
raise ValueError("a coroutine was expected, got {!r}".format(coroutine))
try:
loop = asyncio.get_running_loop()
except RuntimeError:
loop = None
if loop is not None:
return loop.run_until_complete(coroutine)
else:
return asyncio.run(coroutine)
def iter_async(iterable: AsyncIterable[T]) -> Iterable[T]:
if not isinstance(iterable, AsyncIterable):
raise ValueError("an async iterable was expected, got {!r}".format(iterable))
queue = Queue()
async def async_helper():
try:
async for chunk in iterable:
queue.put(chunk)
queue.put(None)
except Exception as e:
queue.put(e)
def helper():
run_async(async_helper())
thread = threading.Thread(target=helper, daemon=True)
thread.start()
while True:
chunk = queue.get()
if chunk is None:
break
if isinstance(chunk, Exception):
raise chunk
yield chunk
thread.join()
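# A minimal usage sketch (not part of this diff): consume an assumed async generator
# synchronously by draining it through iter_async on a background thread.
if __name__ == "__main__":
    async def _numbers():
        for n in range(3):
            await asyncio.sleep(0)
            yield n

    print(list(iter_async(_numbers())))  # [0, 1, 2]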
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
# Copyright (c) Opendatalab. All rights reserved.
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold
# Sort the spans in each line from left to right
def line_sort_spans_by_left_to_right(lines):
line_objects = []
for line in lines:
# Sort by x0 coordinate
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
min(span['bbox'][1] for span in line), # y0
max(span['bbox'][2] for span in line), # x1
max(span['bbox'][3] for span in line), # y1
def fill_spans_in_blocks(blocks, spans, radio):
"""将allspans中的span按位置关系,放入blocks中."""
block_with_spans = []
for block in blocks:
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'type': block_type,
'bbox': block_bbox,
}
if block_type in [
BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
]:
block_dict['group_id'] = block[-1]
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(
span['type'], block_type):
block_spans.append(span)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
# Remove spans that were placed into block_spans from spans
if len(block_spans) > 0:
for span in block_spans:
spans.remove(span)
return block_with_spans, spans
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
return block_type in [
BlockType.TEXT,
BlockType.TITLE,
BlockType.IMAGE_CAPTION,
BlockType.IMAGE_FOOTNOTE,
BlockType.TABLE_CAPTION,
BlockType.TABLE_FOOTNOTE,
BlockType.DISCARDED
]
line_objects.append({
'bbox': line_bbox,
'spans': line,
})
return line_objects
elif span_type == ContentType.INTERLINE_EQUATION:
return block_type in [BlockType.INTERLINE_EQUATION, BlockType.TEXT]
elif span_type == ContentType.IMAGE:
return block_type in [BlockType.IMAGE_BODY]
elif span_type == ContentType.TABLE:
return block_type in [BlockType.TABLE_BODY]
else:
return False
def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = []
for block in discarded_block_with_spans:
block = fix_text_block(block)
fix_discarded_blocks.append(block)
return fix_discarded_blocks
def fix_text_block(block):
# Formula spans inside a text block should all be converted to the inline type
for span in block['spans']:
if span['type'] == ContentType.INTERLINE_EQUATION:
span['type'] = ContentType.INLINE_EQUATION
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
def merge_spans_to_line(spans, threshold=0.6):
......@@ -34,11 +91,11 @@ def merge_spans_to_line(spans, threshold=0.6):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] or any(s['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] for s in current_line):
# then start a new line
lines.append(current_line)
......@@ -60,70 +117,36 @@ def merge_spans_to_line(spans, threshold=0.6):
return lines
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [
BlockType.Text,
BlockType.Title,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableCaption,
BlockType.TableFootnote,
BlockType.Discarded
# Sort the spans in each line from left to right
def line_sort_spans_by_left_to_right(lines):
line_objects = []
for line in lines:
# Sort by x0 coordinate
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
min(span['bbox'][1] for span in line), # y0
max(span['bbox'][2] for span in line), # x1
max(span['bbox'][3] for span in line), # y1
]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
return block_type in [BlockType.ImageBody]
elif span_type == ContentType.Table:
return block_type in [BlockType.TableBody]
else:
return False
def fill_spans_in_blocks(blocks, spans, radio):
"""将allspans中的span按位置关系,放入blocks中."""
block_with_spans = []
for block in blocks:
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'type': block_type,
'bbox': block_bbox,
}
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict['group_id'] = block[-1]
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
block_spans.append(span)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
# Remove spans that were placed into block_spans from spans
if len(block_spans) > 0:
for span in block_spans:
spans.remove(span)
return block_with_spans, spans
line_objects.append({
'bbox': line_bbox,
'spans': line,
})
return line_objects
def fix_block_spans_v2(block_with_spans):
def fix_block_spans(block_with_spans):
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type in [BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
if block_type in [BlockType.TEXT, BlockType.TITLE,
BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
]:
block = fix_text_block(block)
elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
elif block_type in [BlockType.INTERLINE_EQUATION, BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
block = fix_interline_block(block)
else:
continue
......@@ -131,29 +154,9 @@ def fix_block_spans_v2(block_with_spans):
return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = []
for block in discarded_block_with_spans:
block = fix_text_block(block)
fix_discarded_blocks.append(block)
return fix_discarded_blocks
def fix_text_block(block):
# Formula spans inside a text block should all be converted to the inline type
for span in block['spans']:
if span['type'] == ContentType.InterlineEquation:
span['type'] = ContentType.InlineEquation
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
def fix_interline_block(block):
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
return block
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import re
import statistics
import cv2
import numpy as np
from loguru import logger
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio, calculate_iou, \
get_minbox_if_overlap_by_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.pdf_image_tools import get_crop_img
from mineru.utils.pdf_text_tool import get_page
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
def get_block_bboxes(blocks, block_type_list):
return [block[0:4] for block in blocks if block[7] in block_type_list]
image_bboxes = get_block_bboxes(all_bboxes, [BlockType.IMAGE_BODY])
table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TABLE_BODY])
other_block_type = []
for block_type in BlockType.__dict__.values():
if not isinstance(block_type, str):
continue
if block_type not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
other_block_type.append(block_type)
other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.DISCARDED])
new_spans = []
for span in spans:
span_bbox = span['bbox']
span_type = span['type']
if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
discarded_block_bboxes):
new_spans.append(span)
continue
if span_type == ContentType.IMAGE:
if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
image_bboxes):
new_spans.append(span)
elif span_type == ContentType.TABLE:
if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
table_bboxes):
new_spans.append(span)
else:
if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
other_block_bboxes):
new_spans.append(span)
return new_spans
def remove_overlaps_low_confidence_spans(spans):
dropped_spans = []
# Among overlapping spans, drop the ones with lower confidence
for span1 in spans:
for span2 in spans:
if span1 != span2:
# Neither span1 nor span2 should already be in dropped_spans
if span1 in dropped_spans or span2 in dropped_spans:
continue
else:
if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
if span1['score'] < span2['score']:
span_need_remove = span1
else:
span_need_remove = span2
if (
span_need_remove is not None
and span_need_remove not in dropped_spans
):
dropped_spans.append(span_need_remove)
if len(dropped_spans) > 0:
for span_need_remove in dropped_spans:
spans.remove(span_need_remove)
return spans, dropped_spans
def remove_overlaps_min_spans(spans):
dropped_spans = []
# Among overlapping spans, drop the smaller ones
for span1 in spans:
for span2 in spans:
if span1 != span2:
# Neither span1 nor span2 should already be in dropped_spans
if span1 in dropped_spans or span2 in dropped_spans:
continue
else:
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
if overlap_box is not None:
span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if span_need_remove is not None and span_need_remove not in dropped_spans:
dropped_spans.append(span_need_remove)
if len(dropped_spans) > 0:
for span_need_remove in dropped_spans:
spans.remove(span_need_remove)
return spans, dropped_spans
def __replace_ligatures(text: str):
ligatures = {
'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st'
}
return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
def __replace_unicode(text: str):
ligatures = {
'\r\n': '', '\u0002': '-',
}
return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)
"""pdf_text dict方案 char级别"""
def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded_blocks):
page_dict = get_page(pdf_page)
page_all_chars = []
page_all_lines = []
for block in page_dict['blocks']:
for line in block['lines']:
if 0 < abs(line['rotation']) < 90:
# Skip lines whose rotation angle is strictly between 0 and 90 degrees
continue
page_all_lines.append(line)
for span in line['spans']:
for char in span['chars']:
page_all_chars.append(char)
# Compute the median height of all spans
span_height_list = []
for span in spans:
if span['type'] in [ContentType.TEXT]:
span_height = span['bbox'][3] - span['bbox'][1]
span['height'] = span_height
span['width'] = span['bbox'][2] - span['bbox'][0]
span_height_list.append(span_height)
if len(span_height_list) == 0:
return spans
else:
median_span_height = statistics.median(span_height_list)
useful_spans = []
unuseful_spans = []
# Two traits of a vertical span: 1. its height spans multiple lines 2. its height-to-width ratio exceeds a threshold
vertical_spans = []
for span in spans:
if span['type'] in [ContentType.TEXT]:
for block in all_bboxes + all_discarded_blocks:
if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
continue
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
vertical_spans.append(span)
elif block in all_bboxes:
useful_spans.append(span)
else:
unuseful_spans.append(span)
break
"""垂直的span框直接用line进行填充"""
if len(vertical_spans) > 0:
for pdfium_line in page_all_lines:
for span in vertical_spans:
if calculate_overlap_area_in_bbox1_area_ratio(pdfium_line['bbox'].bbox, span['bbox']) > 0.5:
for pdfium_span in pdfium_line['spans']:
span['content'] += pdfium_span['text']
break
for span in vertical_spans:
if len(span['content']) == 0:
spans.remove(span)
"""水平的span框先用char填充,再用ocr填充空的span框"""
new_spans = []
for span in useful_spans + unuseful_spans:
if span['type'] in [ContentType.TEXT]:
span['chars'] = []
new_spans.append(span)
need_ocr_spans = fill_char_in_spans(new_spans, page_all_chars)
"""对未填充的span进行ocr"""
if len(need_ocr_spans) > 0:
for span in need_ocr_spans:
# Crop the span's bbox and run OCR on the crop
span_pil_img = get_crop_img(span['bbox'], pil_img, scale)
span_img = cv2.cvtColor(np.array(span_pil_img), cv2.COLOR_RGB2BGR)
# Compute the span's contrast; spans whose contrast is too low (<= 0.17) are not OCRed
if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
spans.remove(span)
continue
span['content'] = ''
span['score'] = 1.0
span['np_img'] = span_img
return spans
def fill_char_in_spans(spans, all_chars):
# Roughly sort the spans from top to bottom
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars:
for span in spans:
if calculate_char_in_span(char['bbox'], span['bbox'], char['char']):
span['chars'].append(char)
break
need_ocr_spans = []
for span in spans:
chars_to_content(span)
# Some spans contain no text, only one or two empty placeholders; filter them using width, height and content length
if len(span['content']) * span['height'] < span['width'] * 0.5:
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
need_ocr_spans.append(span)
del span['height'], span['width']
return need_ocr_spans
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
Span_Height_Radio = 0.33  # the offset between a char's vertical midline and the span's midline must not exceed 1/3 of the span height
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
span_height = span_bbox[3] - span_bbox[1]
if (
span_bbox[0] < char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio  # the char's midline must be within Span_Height_Radio of the span's midline
):
return True
else:
# If the char is a LINE_STOP_FLAG, skip the center-point test and use another rule instead
# (its left edge must lie inside the span, with the same height check as before); this gives
# trailing punctuation a chance to join the span, provided it sits close to the span's right edge
if char in LINE_STOP_FLAG:
if (
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
and char_center_x > span_bbox[0]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
elif char in LINE_START_FLAG:
if (
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
and char_center_x < span_bbox[2]
and span_bbox[1] < char_center_y < span_bbox[3]
and abs(char_center_y - span_center_y) < span_height * span_height_radio
):
return True
else:
return False
def chars_to_content(span):
# Check whether the span's char list is empty
if len(span['chars']) == 0:
pass
else:
# Sort chars by char_idx
span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
# Calculate the width of each character
char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
# Calculate the median width
median_width = statistics.median(char_widths)
content = ''
for char in span['chars']:
# If the gap between the next char's x0 and this char's x1 exceeds 0.25 of the median char width, insert a space between them
char1 = char
char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['char'] != ' ' and char2['char'] != ' ':
content += f"{char['char']} "
else:
content += char['char']
content = __replace_unicode(content)
content = __replace_ligatures(content)
span['content'] = content.strip()
del span['chars']
def calculate_contrast(img, img_mode) -> float:
"""
Compute the contrast of the given image.
:param img: the image, as a numpy.ndarray
:param img_mode: the image's channel order, 'rgb' or 'bgr'
:return: the contrast value of the image
"""
if img_mode == 'rgb':
# Convert the RGB image to grayscale
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
elif img_mode == 'bgr':
# Convert the BGR image to grayscale
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
else:
raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
# Compute the mean and standard deviation
mean_value = np.mean(gray_img)
std_dev = np.std(gray_img)
# Contrast is defined as the standard deviation divided by the mean (a small constant avoids division by zero)
contrast = std_dev / (mean_value + 1e-6)
# logger.debug(f"contrast: {contrast}")
return round(contrast, 2)
\ No newline at end of file
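# A minimal usage sketch (not part of this diff): contrast of a synthetic flat-gray BGR
# image versus a half-black / half-white one, using the helper defined above.
if __name__ == "__main__":
    flat_gray = np.full((32, 32, 3), 128, dtype=np.uint8)
    half_and_half = np.zeros((32, 32, 3), dtype=np.uint8)
    half_and_half[:, 16:] = 255
    print(calculate_contrast(flat_gray, img_mode='bgr'))      # 0.0 (no variation)
    print(calculate_contrast(half_and_half, img_mode='bgr'))  # ~1.0 (std is about equal to mean)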
__version__ = "2.0.0"
\ No newline at end of file
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
formats:
- epub
python:
install:
- requirements: next_docs/requirements.txt
sphinx:
configuration: next_docs/en/conf.py
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="224" height="72" viewBox="-29 -3.67 224 72" xml:space="preserve">
<desc>Created with Fabric.js 5.2.4</desc>
<defs>
</defs>
<rect x="0" y="0" width="100%" height="100%" fill="transparent"></rect>
<g transform="matrix(1 0 0 1 112 36)" id="7a867f58-a908-4f30-a839-fb725512b521" >
<rect style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(255,255,255); fill-rule: nonzero; opacity: 1; visibility: hidden;" vector-effect="non-scaling-stroke" x="-112" y="-36" rx="0" ry="0" width="224" height="72" />
</g>
<g transform="matrix(Infinity NaN NaN Infinity 0 0)" id="29611287-bf1c-4faf-8eb1-df32f6424829" >
</g>
<g transform="matrix(0.07 0 0 0.07 382.02 122.8)" id="60cdd44f-027a-437a-92c4-c8d44c60ef9e" >
<path style="stroke: rgb(0,0,0); stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(50,50,42); fill-rule: nonzero; opacity: 1;" vector-effect="non-scaling-stroke" transform=" translate(-64, -64)" d="M 57.62 61.68 C 55.919999999999995 61.92 54.75 63.46 55 65.11 C 55.1668510745875 66.32380621250819 56.039448735907676 67.32218371690155 57.22 67.65 C 57.22 67.65 64.69 70.11 77.4 71.16000000000001 C 87.61000000000001 72.01 99.2 70.43 99.2 70.43 C 100.9 70.39 102.23 68.98 102.19 67.28 C 102.17037752125772 66.4652516996782 101.82707564255573 65.69186585376654 101.23597809465886 65.13079230830253 C 100.644880546762 64.56971876283853 99.85466451370849 64.26716220997277 99.03999999999999 64.29 C 98.83999999999999 64.29 98.63999999999999 64.33000000000001 98.42999999999999 64.37 C 98.42999999999999 64.37 87.08999999999999 65.78 77.88 65.02000000000001 C 65.72999999999999 64.05000000000001 59.11 61.83000000000001 59.11 61.83000000000001 C 58.63 61.670000000000016 58.1 61.59000000000001 57.62 61.670000000000016 Z M 57.62 46.46 C 55.919999999999995 46.7 54.75 48.24 55 49.89 C 55.1668510745875 51.10380621250818 56.039448735907676 52.10218371690154 57.22 52.43 C 57.22 52.43 64.69 54.89 77.4 55.94 C 87.61000000000001 56.79 99.2 55.21 99.2 55.21 C 100.9 55.17 102.23 53.76 102.19 52.06 C 102.17037752125772 51.245251699678214 101.82707564255573 50.47186585376654 101.23597809465886 49.91079230830253 C 100.644880546762 49.34971876283853 99.85466451370849 49.047162209972754 99.03999999999999 49.07 C 98.83999999999999 49.07 98.63999999999999 49.11 98.42999999999999 49.15 C 98.42999999999999 49.15 87.08999999999999 50.559999999999995 77.88 49.8 C 65.72999999999999 48.83 59.11 46.61 59.11 46.61 C 58.63 46.45 58.1 46.37 57.62 46.45 Z M 57.62 31.240000000000002 C 55.919999999999995 31.48 54.75 33.02 55 34.67 C 55.1668510745875 35.88380621250818 56.039448735907676 36.882183716901544 57.22 37.21 C 57.22 37.21 64.69 39.67 77.4 40.72 C 87.61000000000001 41.57 99.2 39.99 99.2 39.99 C 100.9 39.95 102.23 38.54 102.19 36.84 C 102.17037752125772 36.025251699678215 101.82707564255573 35.25186585376654 101.23597809465886 34.690792308302534 C 100.644880546762 34.12971876283853 99.85466451370849 33.827162209972755 99.03999999999999 33.85 C 98.83999999999999 33.85 98.63999999999999 33.89 98.42999999999999 33.93 C 98.42999999999999 33.93 87.08999999999999 35.339999999999996 77.88 34.58 C 65.72999999999999 33.61 59.11 31.389999999999997 59.11 31.389999999999997 C 58.63 31.229999999999997 58.1 31.189999999999998 57.62 31.229999999999997 Z M 57.62 16.060000000000002 C 55.919999999999995 16.3 54.75 17.840000000000003 55 19.490000000000002 C 55.1668510745875 20.703806212508187 56.039448735907676 21.702183716901544 57.22 22.03 C 57.22 22.03 64.69 24.490000000000002 77.4 25.54 C 87.61000000000001 26.39 99.2 24.81 99.2 24.81 C 100.9 24.77 102.23 23.36 102.19 21.66 C 102.17037752125772 20.84525169967821 101.82707564255573 20.07186585376654 101.23597809465886 19.510792308302534 C 100.644880546762 18.949718762838526 99.8546645137085 18.64716220997276 99.03999999999999 18.67 C 98.83999999999999 18.67 98.63999999999999 18.71 98.42999999999999 18.75 C 98.42999999999999 18.75 87.08999999999999 20.16 77.88 19.4 C 65.72999999999999 18.43 59.11 16.209999999999997 59.11 16.209999999999997 C 58.637850878541954 16.01924514007714 58.12188500879498 15.963839409097599 57.62 16.049999999999997 Z M 36.31 0 C 20.32 0.12 14.39 5.05 14.39 5.05 L 14.39 124.42 C 14.39 
124.42 20.2 119.41 38.93 120.18 C 57.66 120.95000000000002 61.5 127.53 84.50999999999999 127.97000000000001 C 107.52 128.41000000000003 113.28999999999999 124.42000000000002 113.28999999999999 124.42000000000002 L 113.60999999999999 2.750000000000014 C 113.60999999999999 2.750000000000014 103.28 5.7 83.09 5.86 C 62.95 6.01 58.11 0.73 39.62 0.12 C 38.49 0.04 37.4 0 36.31 0 Z M 49.67 7.79 C 49.67 7.79 59.36 10.98 77.24000000000001 11.870000000000001 C 92.38000000000001 12.64 107.52000000000001 10.38 107.52000000000001 10.38 L 107.52000000000001 118.53 C 107.52000000000001 118.53 99.85000000000001 122.57000000000001 80.68 121.19 C 65.82000000000001 120.14 49.480000000000004 114.49 49.480000000000004 114.49 L 49.68000000000001 7.799999999999997 Z M 40.35 10.620000000000001 C 42.050000000000004 10.620000000000001 43.46 11.990000000000002 43.46 13.73 C 43.46 15.469999999999999 42.09 16.84 40.35 16.84 C 40.35 16.84 35.34 16.88 32.28 17.16 C 27.150000000000002 17.68 23.64 19.54 23.64 19.54 C 22.150000000000002 20.349999999999998 20.25 19.74 19.48 18.25 C 18.67 16.76 19.28 14.86 20.77 14.09 C 22.259999999999998 13.32 25.33 11.67 31.67 11.06 C 35.34 10.66 40.35 10.620000000000001 40.35 10.620000000000001 Z M 37.36 25.880000000000003 C 39.06 25.840000000000003 40.35 25.880000000000003 40.35 25.880000000000003 C 42.050000000000004 26.080000000000002 43.260000000000005 27.62 43.050000000000004 29.310000000000002 C 42.88374644848126 30.726609090871516 41.76660909087151 31.843746448481262 40.35 32.010000000000005 C 40.35 32.010000000000005 35.34 32.050000000000004 32.28 32.330000000000005 C 27.150000000000002 32.85000000000001 23.64 34.71000000000001 23.64 34.71000000000001 C 22.150000000000002 35.52000000000001 20.25 34.91000000000001 19.48 33.42000000000001 C 18.67 31.93000000000001 19.28 30.03000000000001 20.77 29.26000000000001 C 20.77 29.26000000000001 25.33 26.84000000000001 31.67 26.230000000000008 C 33.53 25.99000000000001 35.67 25.910000000000007 37.36 25.870000000000008 Z M 40.35 41.06 C 42.050000000000004 41.06 43.46 42.43 43.46 44.17 C 43.46 45.910000000000004 42.09 47.28 40.35 47.28 C 40.35 47.28 35.34 47.24 32.28 47.56 C 27.150000000000002 48.080000000000005 23.64 49.940000000000005 23.64 49.940000000000005 C 22.150000000000002 50.75000000000001 20.25 50.14000000000001 19.48 48.650000000000006 C 18.67 47.160000000000004 19.28 45.260000000000005 20.77 44.49000000000001 C 20.77 44.49000000000001 25.33 42.07000000000001 31.67 41.46000000000001 C 35.34 41.02000000000001 40.35 41.06000000000001 40.35 41.06000000000001 Z" stroke-linecap="round" />
</g>
<g transform="matrix(0.07 0 0 0.07 396.05 123.14)" style="" id="eb0df536-c517-4781-a7c0-3f84cd77c272" >
<text xml:space="preserve" font-family="Lato" font-size="40" font-style="normal" font-weight="400" style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(0,0,0); fill-rule: nonzero; opacity: 1; white-space: pre;" ><tspan x="-130" y="12.57" >Read The Docs</tspan></text>
</g>
<g transform="matrix(0.28 0 0 0.28 27.88 36)" id="7b9eddb9-1652-4040-9437-2ab90652d624" >
<path style="stroke: rgb(0,0,0); stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(50,50,42); fill-rule: nonzero; opacity: 1;" vector-effect="non-scaling-stroke" transform=" translate(-64, -64)" d="M 57.62 61.68 C 55.919999999999995 61.92 54.75 63.46 55 65.11 C 55.1668510745875 66.32380621250819 56.039448735907676 67.32218371690155 57.22 67.65 C 57.22 67.65 64.69 70.11 77.4 71.16000000000001 C 87.61000000000001 72.01 99.2 70.43 99.2 70.43 C 100.9 70.39 102.23 68.98 102.19 67.28 C 102.17037752125772 66.4652516996782 101.82707564255573 65.69186585376654 101.23597809465886 65.13079230830253 C 100.644880546762 64.56971876283853 99.85466451370849 64.26716220997277 99.03999999999999 64.29 C 98.83999999999999 64.29 98.63999999999999 64.33000000000001 98.42999999999999 64.37 C 98.42999999999999 64.37 87.08999999999999 65.78 77.88 65.02000000000001 C 65.72999999999999 64.05000000000001 59.11 61.83000000000001 59.11 61.83000000000001 C 58.63 61.670000000000016 58.1 61.59000000000001 57.62 61.670000000000016 Z M 57.62 46.46 C 55.919999999999995 46.7 54.75 48.24 55 49.89 C 55.1668510745875 51.10380621250818 56.039448735907676 52.10218371690154 57.22 52.43 C 57.22 52.43 64.69 54.89 77.4 55.94 C 87.61000000000001 56.79 99.2 55.21 99.2 55.21 C 100.9 55.17 102.23 53.76 102.19 52.06 C 102.17037752125772 51.245251699678214 101.82707564255573 50.47186585376654 101.23597809465886 49.91079230830253 C 100.644880546762 49.34971876283853 99.85466451370849 49.047162209972754 99.03999999999999 49.07 C 98.83999999999999 49.07 98.63999999999999 49.11 98.42999999999999 49.15 C 98.42999999999999 49.15 87.08999999999999 50.559999999999995 77.88 49.8 C 65.72999999999999 48.83 59.11 46.61 59.11 46.61 C 58.63 46.45 58.1 46.37 57.62 46.45 Z M 57.62 31.240000000000002 C 55.919999999999995 31.48 54.75 33.02 55 34.67 C 55.1668510745875 35.88380621250818 56.039448735907676 36.882183716901544 57.22 37.21 C 57.22 37.21 64.69 39.67 77.4 40.72 C 87.61000000000001 41.57 99.2 39.99 99.2 39.99 C 100.9 39.95 102.23 38.54 102.19 36.84 C 102.17037752125772 36.025251699678215 101.82707564255573 35.25186585376654 101.23597809465886 34.690792308302534 C 100.644880546762 34.12971876283853 99.85466451370849 33.827162209972755 99.03999999999999 33.85 C 98.83999999999999 33.85 98.63999999999999 33.89 98.42999999999999 33.93 C 98.42999999999999 33.93 87.08999999999999 35.339999999999996 77.88 34.58 C 65.72999999999999 33.61 59.11 31.389999999999997 59.11 31.389999999999997 C 58.63 31.229999999999997 58.1 31.189999999999998 57.62 31.229999999999997 Z M 57.62 16.060000000000002 C 55.919999999999995 16.3 54.75 17.840000000000003 55 19.490000000000002 C 55.1668510745875 20.703806212508187 56.039448735907676 21.702183716901544 57.22 22.03 C 57.22 22.03 64.69 24.490000000000002 77.4 25.54 C 87.61000000000001 26.39 99.2 24.81 99.2 24.81 C 100.9 24.77 102.23 23.36 102.19 21.66 C 102.17037752125772 20.84525169967821 101.82707564255573 20.07186585376654 101.23597809465886 19.510792308302534 C 100.644880546762 18.949718762838526 99.8546645137085 18.64716220997276 99.03999999999999 18.67 C 98.83999999999999 18.67 98.63999999999999 18.71 98.42999999999999 18.75 C 98.42999999999999 18.75 87.08999999999999 20.16 77.88 19.4 C 65.72999999999999 18.43 59.11 16.209999999999997 59.11 16.209999999999997 C 58.637850878541954 16.01924514007714 58.12188500879498 15.963839409097599 57.62 16.049999999999997 Z M 36.31 0 C 20.32 0.12 14.39 5.05 14.39 5.05 L 14.39 124.42 C 14.39 
124.42 20.2 119.41 38.93 120.18 C 57.66 120.95000000000002 61.5 127.53 84.50999999999999 127.97000000000001 C 107.52 128.41000000000003 113.28999999999999 124.42000000000002 113.28999999999999 124.42000000000002 L 113.60999999999999 2.750000000000014 C 113.60999999999999 2.750000000000014 103.28 5.7 83.09 5.86 C 62.95 6.01 58.11 0.73 39.62 0.12 C 38.49 0.04 37.4 0 36.31 0 Z M 49.67 7.79 C 49.67 7.79 59.36 10.98 77.24000000000001 11.870000000000001 C 92.38000000000001 12.64 107.52000000000001 10.38 107.52000000000001 10.38 L 107.52000000000001 118.53 C 107.52000000000001 118.53 99.85000000000001 122.57000000000001 80.68 121.19 C 65.82000000000001 120.14 49.480000000000004 114.49 49.480000000000004 114.49 L 49.68000000000001 7.799999999999997 Z M 40.35 10.620000000000001 C 42.050000000000004 10.620000000000001 43.46 11.990000000000002 43.46 13.73 C 43.46 15.469999999999999 42.09 16.84 40.35 16.84 C 40.35 16.84 35.34 16.88 32.28 17.16 C 27.150000000000002 17.68 23.64 19.54 23.64 19.54 C 22.150000000000002 20.349999999999998 20.25 19.74 19.48 18.25 C 18.67 16.76 19.28 14.86 20.77 14.09 C 22.259999999999998 13.32 25.33 11.67 31.67 11.06 C 35.34 10.66 40.35 10.620000000000001 40.35 10.620000000000001 Z M 37.36 25.880000000000003 C 39.06 25.840000000000003 40.35 25.880000000000003 40.35 25.880000000000003 C 42.050000000000004 26.080000000000002 43.260000000000005 27.62 43.050000000000004 29.310000000000002 C 42.88374644848126 30.726609090871516 41.76660909087151 31.843746448481262 40.35 32.010000000000005 C 40.35 32.010000000000005 35.34 32.050000000000004 32.28 32.330000000000005 C 27.150000000000002 32.85000000000001 23.64 34.71000000000001 23.64 34.71000000000001 C 22.150000000000002 35.52000000000001 20.25 34.91000000000001 19.48 33.42000000000001 C 18.67 31.93000000000001 19.28 30.03000000000001 20.77 29.26000000000001 C 20.77 29.26000000000001 25.33 26.84000000000001 31.67 26.230000000000008 C 33.53 25.99000000000001 35.67 25.910000000000007 37.36 25.870000000000008 Z M 40.35 41.06 C 42.050000000000004 41.06 43.46 42.43 43.46 44.17 C 43.46 45.910000000000004 42.09 47.28 40.35 47.28 C 40.35 47.28 35.34 47.24 32.28 47.56 C 27.150000000000002 48.080000000000005 23.64 49.940000000000005 23.64 49.940000000000005 C 22.150000000000002 50.75000000000001 20.25 50.14000000000001 19.48 48.650000000000006 C 18.67 47.160000000000004 19.28 45.260000000000005 20.77 44.49000000000001 C 20.77 44.49000000000001 25.33 42.07000000000001 31.67 41.46000000000001 C 35.34 41.02000000000001 40.35 41.06000000000001 40.35 41.06000000000001 Z" stroke-linecap="round" />
</g>
<g transform="matrix(0.9 0 0 0.9 94 36)" style="" id="385bde16-f9fa-4222-bfea-1d5d5efcf730" >
<text xml:space="preserve" font-family="Lato" font-size="15" font-style="normal" font-weight="100" style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(0,0,0); fill-rule: nonzero; opacity: 1; white-space: pre;" ><tspan x="-48.68" y="4.71" >Read The Docs</tspan></text>
</g>
</svg>
\ No newline at end of file