Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7f2f2c0f
Commit
7f2f2c0f
authored
Nov 29, 2024
by
myhloli
Browse files
refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment
parent
d4345b6e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
57 additions
and
9 deletions
+57
-9
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+47
-1
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
+8
-6
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
...model/sub_modules/table/tablemaster/tablemaster_paddle.py
+2
-2
No files found.
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
View file @
7f2f2c0f
import
cv2
import
numpy
as
np
import
numpy
as
np
from
loguru
import
logger
from
loguru
import
logger
from
io
import
BytesIO
from
PIL
import
Image
import
base64
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
from
magic_pdf.pre_proc.ocr_dict_merge
import
merge_spans_to_line
from
ppocr.utils.utility
import
check_and_read
def img_decode(content: bytes):
    """Decode an encoded image held in memory into an OpenCV array.

    Args:
        content: Raw bytes of an encoded image file.

    Returns:
        Whatever ``cv2.imdecode`` yields for the buffer when called with
        ``cv2.IMREAD_UNCHANGED``.
    """
    # View the bytes as a flat uint8 buffer, which is the input imdecode expects.
    raw_buffer = np.frombuffer(content, dtype=np.uint8)
    decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_UNCHANGED)
    return decoded
def check_img(img):
    """Normalize an image supplied as bytes, a file path, or an array.

    Args:
        img: Encoded image bytes, a path string, or an already-decoded
            ``np.ndarray``. Any other value is passed through untouched.

    Returns:
        The decoded image, with 2-D (single-channel) arrays expanded to
        BGR, or ``None`` when an image given by path could not be loaded.
    """
    if isinstance(img, bytes):
        img = img_decode(img)
    if isinstance(img, str):
        image_file = img
        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            # Read the raw bytes first and release the file handle before
            # doing any decoding work.
            with open(image_file, 'rb') as f:
                img_str = f.read()
            img = img_decode(img_str)
            if img is None:
                # OpenCV could not decode the file: let Pillow open it,
                # re-encode as JPEG in memory and decode that instead.
                try:
                    buf = BytesIO()
                    rgb = Image.open(BytesIO(img_str)).convert('RGB')
                    rgb.save(buf, 'jpeg')
                    img_array = np.frombuffer(buf.getvalue(), np.uint8)
                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                except Exception:
                    # Catch Exception rather than using a bare except so
                    # KeyboardInterrupt / SystemExit are not swallowed.
                    logger.error("error in loading image:{}".format(image_file))
                    return None
        if img is None:
            logger.error("error in loading image:{}".format(image_file))
            return None
    if isinstance(img, np.ndarray) and len(img.shape) == 2:
        # Single-channel input: expand to three BGR channels.
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img
def
bbox_to_points
(
bbox
):
def
bbox_to_points
(
bbox
):
""" 将bbox格式转换为四个顶点的数组 """
""" 将bbox格式转换为四个顶点的数组 """
...
...
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py
View file @
7f2f2c0f
import
copy
import
copy
import
time
import
time
import
cv2
import
cv2
import
numpy
as
np
import
numpy
as
np
from
paddleocr
import
PaddleOCR
from
paddleocr
import
PaddleOCR
from
paddleocr.paddleocr
import
check_img
,
logger
from
ppocr.utils.logging
import
get_logger
from
paddleocr.ppocr.utils.utility
import
alpha_to_color
,
binarize_img
from
ppocr.utils.utility
import
alpha_to_color
,
binarize_img
from
paddleocr.tools.infer.predict_system
import
sorted_boxes
from
tools.infer.predict_system
import
sorted_boxes
from
paddleocr.tools.infer.utility
import
get_rotate_crop_image
,
get_minarea_rect_crop
from
tools.infer.utility
import
get_rotate_crop_image
,
get_minarea_rect_crop
from
magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils
import
update_det_boxes
,
merge_det_boxes
,
check_img
from
magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils
import
update_det_boxes
,
merge_det_boxes
logger
=
get_logger
()
class
ModifiedPaddleOCR
(
PaddleOCR
):
class
ModifiedPaddleOCR
(
PaddleOCR
):
...
...
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
View file @
7f2f2c0f
...
@@ -2,8 +2,8 @@ import os
...
@@ -2,8 +2,8 @@ import os
import
cv2
import
cv2
import
numpy
as
np
import
numpy
as
np
from
paddleocr.
ppstructure.table.predict_table
import
TableSystem
from
ppstructure.table.predict_table
import
TableSystem
from
paddleocr.
ppstructure.utility
import
init_args
from
ppstructure.utility
import
init_args
from
PIL
import
Image
from
PIL
import
Image
from
magic_pdf.config.constants
import
*
# noqa: F403
from
magic_pdf.config.constants
import
*
# noqa: F403
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment