Commit 3854bd0f authored by Li Xia
Browse files

Fix: 表格内容中的HTML Entity会导致表格内容错乱 [#2694]

parent 45a282fa
import os import os
import html
import cv2 import cv2
import numpy as np import numpy as np
from loguru import logger from loguru import logger
...@@ -8,6 +9,11 @@ from mineru.utils.enum_class import ModelPath ...@@ -8,6 +9,11 @@ from mineru.utils.enum_class import ModelPath
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
def escape_html(input_string: str) -> str:
    """Return ``input_string`` with HTML-significant characters escaped.

    Thin wrapper around :func:`html.escape`. The characters ``&``, ``<`` and
    ``>`` are replaced by their entity forms, and because quote escaping is
    enabled, ``"`` and ``'`` are replaced as well.

    Args:
        input_string: Raw text that may contain HTML-special characters.

    Returns:
        The escaped text, which can be embedded in HTML markup without
        being interpreted as tags or entities.
    """
    # quote=True is html.escape's default; it is spelled out so the fact
    # that quotes are escaped too is visible at the call site.
    return html.escape(input_string, quote=True)
class RapidTableModel(object): class RapidTableModel(object):
def __init__(self, ocr_engine): def __init__(self, ocr_engine):
slanet_plus_model_path = os.path.join(auto_download_and_get_model_root_path(ModelPath.slanet_plus), ModelPath.slanet_plus) slanet_plus_model_path = os.path.join(auto_download_and_get_model_root_path(ModelPath.slanet_plus), ModelPath.slanet_plus)
...@@ -63,7 +69,7 @@ class RapidTableModel(object): ...@@ -63,7 +69,7 @@ class RapidTableModel(object):
# Continue with OCR on potentially rotated image # Continue with OCR on potentially rotated image
ocr_result = self.ocr_engine.ocr(bgr_image)[0] ocr_result = self.ocr_engine.ocr(bgr_image)[0]
if ocr_result: if ocr_result:
ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if ocr_result = [[item[0], escape_html(item[1][0]), item[1][1]] for item in ocr_result if
len(item) == 2 and isinstance(item[1], tuple)] len(item) == 2 and isinstance(item[1], tuple)]
else: else:
ocr_result = None ocr_result = None
......
...@@ -319,6 +319,14 @@ ...@@ -319,6 +319,14 @@
"created_at": "2025-06-17T03:09:54Z", "created_at": "2025-06-17T03:09:54Z",
"repoId": 765083837, "repoId": 765083837,
"pullRequestNo": 2676 "pullRequestNo": 2676
},
{
"name": "hsia",
"id": 654127,
"comment_id": 2979415817,
"created_at": "2025-06-17T17:35:10Z",
"repoId": 765083837,
"pullRequestNo": 2699
} }
] ]
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment