Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7f2f2c0f
Commit
7f2f2c0f
authored
Nov 29, 2024
by
myhloli
Browse files
refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment
parent
d4345b6e
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
57 additions
and
9 deletions
+57
-9
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+47
-1
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
+8
-6
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
...model/sub_modules/table/tablemaster/tablemaster_paddle.py
+2
-2
No files found.
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
View file @
7f2f2c0f
import
cv2
import
numpy
as
np
from
loguru
import
logger
from
io
import
BytesIO
from
PIL
import
Image
import
base64
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
from
ppocr.utils.utility
import
check_and_read
def img_decode(content: bytes):
    """Decode an in-memory encoded image into an OpenCV array.

    Args:
        content: Raw bytes of an encoded image file.

    Returns:
        The result of ``cv2.imdecode`` called with ``cv2.IMREAD_UNCHANGED``
        on a uint8 view of ``content``. Callers in this file treat a ``None``
        result as a decode failure.
    """
    # View the bytes as a flat uint8 buffer, which is what imdecode expects.
    raw_buffer = np.frombuffer(content, dtype=np.uint8)
    decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_UNCHANGED)
    return decoded
def check_img(img):
    """Normalize an image given as encoded bytes, a file path, or an array.

    Args:
        img: One of
            * ``bytes`` - decoded with ``img_decode``;
            * ``str`` - treated as a file path and loaded from disk;
            * anything else - passed through (a 2-D ``np.ndarray`` is
              expanded to BGR).

    Returns:
        The loaded image, with 2-D arrays converted to 3-channel BGR, or
        ``None`` when a file path could not be loaded (an error is logged in
        that case).
    """
    if isinstance(img, bytes):
        img = img_decode(img)

    if isinstance(img, str):
        image_file = img
        # NOTE(review): check_and_read is an external helper; presumably it
        # loads GIF/PDF inputs itself and reports which one via the flags.
        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            # Read the whole file first so the handle is released before any
            # decoding work starts.
            with open(image_file, 'rb') as f:
                img_str = f.read()
            img = img_decode(img_str)
            if img is None:
                # OpenCV could not decode the data: re-encode it as an RGB
                # JPEG through Pillow and let OpenCV decode that instead.
                try:
                    buf = BytesIO()
                    Image.open(BytesIO(img_str)).convert('RGB').save(buf, 'jpeg')
                    img_array = np.frombuffer(buf.getvalue(), np.uint8)
                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                except Exception:
                    # Exception (not a bare except) so KeyboardInterrupt and
                    # SystemExit still propagate.
                    logger.error("error in loading image:{}".format(image_file))
                    return None
        if img is None:
            logger.error("error in loading image:{}".format(image_file))
            return None

    # A 2-D array has no channel axis; expand it to 3-channel BGR.
    if isinstance(img, np.ndarray) and len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img
def
bbox_to_points
(
bbox
):
""" 将bbox格式转换为四个顶点的数组 """
...
...
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
View file @
7f2f2c0f
import
copy
import
time
import
cv2
import
numpy
as
np
from
paddleocr
import
PaddleOCR
from
paddleocr.paddleocr
import
check_img
,
logger
from
paddleocr.ppocr.utils.utility
import
alpha_to_color
,
binarize_img
from
paddleocr.tools.infer.predict_system
import
sorted_boxes
from
paddleocr.tools.infer.utility
import
get_rotate_crop_image
,
get_minarea_rect_crop
from
ppocr.utils.logging
import
get_logger
from
ppocr.utils.utility
import
alpha_to_color
,
binarize_img
from
tools.infer.predict_system
import
sorted_boxes
from
tools.infer.utility
import
get_rotate_crop_image
,
get_minarea_rect_crop
from
magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils
import
update_det_boxes
,
merge_det_boxes
,
check_img
from
magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils
import
update_det_boxes
,
merge_det_boxes
logger
=
get_logger
()
class
ModifiedPaddleOCR
(
PaddleOCR
):
...
...
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
View file @
7f2f2c0f
...
...
@@ -2,8 +2,8 @@ import os
import
cv2
import
numpy
as
np
from
paddleocr.
ppstructure.table.predict_table
import
TableSystem
from
paddleocr.
ppstructure.utility
import
init_args
from
ppstructure.table.predict_table
import
TableSystem
from
ppstructure.utility
import
init_args
from
PIL
import
Image
from
magic_pdf.config.constants
import
*
# noqa: F403
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment