Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
bed386f7
Unverified
Commit
bed386f7
authored
Nov 15, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 15, 2024
Browse files
Merge pull request #964 from myhloli/dev
refactor(model): rename and restructure model modules
parents
8ddbe8bb
c064379c
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
35 additions
and
17 deletions
+35
-17
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py
...model/sub_modules/reading_oreder/layoutreader/__init__.py
+0
-0
magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py
.../model/sub_modules/reading_oreder/layoutreader/helpers.py
+0
-0
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py
...df/model/sub_modules/reading_oreder/layoutreader/xycut.py
+0
-0
magic_pdf/model/sub_modules/table/__init__.py
magic_pdf/model/sub_modules/table/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
+14
-0
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py
...f/model/sub_modules/table/structeqtable/struct_eqtable.py
+3
-11
magic_pdf/model/sub_modules/table/table_utils.py
magic_pdf/model/sub_modules/table/table_utils.py
+11
-0
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
...model/sub_modules/table/tablemaster/tablemaster_paddle.py
+1
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+3
-3
setup.py
setup.py
+1
-0
tests/test_table/test_tablemaster.py
tests/test_table/test_tablemaster.py
+2
-2
No files found.
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py
0 → 100644
View file @
bed386f7
magic_pdf/model/
v3
/helpers.py
→
magic_pdf/model/
sub_modules/reading_oreder/layoutreader
/helpers.py
View file @
bed386f7
File moved
magic_pdf/model/
v3
/xycut.py
→
magic_pdf/model/
sub_modules/reading_oreder/layoutreader
/xycut.py
View file @
bed386f7
File moved
magic_pdf/model/sub_modules/table/__init__.py
0 → 100644
View file @
bed386f7
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
0 → 100644
View file @
bed386f7
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
0 → 100644
View file @
bed386f7
import
numpy
as
np
from
rapid_table
import
RapidTable
from
rapidocr_paddle
import
RapidOCR
class RapidTableModel(object):
    """Table recognizer that feeds RapidOCR output into a RapidTable model.

    Attributes set by ``__init__``:
        table_model: a ``RapidTable`` instance, called with the image array
            and the OCR result.
        ocr_engine: a ``RapidOCR`` instance, called with the image array.
    """

    def __init__(self, use_cuda=True):
        """Create the table model and the OCR engine.

        Args:
            use_cuda: Value forwarded to the OCR engine's ``det_use_cuda``,
                ``cls_use_cuda`` and ``rec_use_cuda`` flags. Defaults to
                ``True``, which reproduces the previously hard-coded setup.
        """
        self.table_model = RapidTable()
        self.ocr_engine = RapidOCR(
            det_use_cuda=use_cuda,
            cls_use_cuda=use_cuda,
            rec_use_cuda=use_cuda,
        )

    def predict(self, image):
        """Run OCR on ``image`` and pass the result to the table model.

        Args:
            image: Any object accepted by ``numpy.asarray``
                (presumably a PIL image -- confirm against callers).

        Returns:
            The three values produced by the table model, in order:
            ``(html_code, table_cell_bboxes, elapse)``.
        """
        # Convert once and share the array between both models instead of
        # materializing the same image twice.
        image_array = np.asarray(image)
        ocr_result, _ = self.ocr_engine(image_array)
        # NOTE(review): ocr_result is forwarded unchecked; confirm how the
        # table model behaves when OCR detects no text.
        html_code, table_cell_bboxes, elapse = self.table_model(
            image_array, ocr_result
        )
        return html_code, table_cell_bboxes, elapse
\ No newline at end of file
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
0 → 100644
View file @
bed386f7
magic_pdf/model/
pek_
sub_modules/structeqtable/
S
truct
TableModel
.py
→
magic_pdf/model/sub_modules/
table/
structeqtable/
s
truct
_eqtable
.py
View file @
bed386f7
import
re
import
torch
from
struct_eqtable
import
build_model
from
magic_pdf.model.sub_modules.table.table_utils
import
minify_html
class
StructTableModel
:
def
__init__
(
self
,
model_path
,
max_new_tokens
=
1024
,
max_time
=
60
):
...
...
@@ -31,15 +31,7 @@ class StructTableModel:
)
if
output_format
==
"html"
:
results
=
[
self
.
minify_html
(
html
)
for
html
in
results
]
results
=
[
minify_html
(
html
)
for
html
in
results
]
return
results
def
minify_html
(
self
,
html
):
# 移除多余的空白字符
html
=
re
.
sub
(
r
'\s+'
,
' '
,
html
)
# 移除行尾的空白字符
html
=
re
.
sub
(
r
'\s*>\s*'
,
'>'
,
html
)
# 移除标签前的空白字符
html
=
re
.
sub
(
r
'\s*<\s*'
,
'<'
,
html
)
return
html
.
strip
()
\ No newline at end of file
magic_pdf/model/sub_modules/table/table_utils.py
0 → 100644
View file @
bed386f7
import
re
def minify_html(html):
    """Return ``html`` with redundant whitespace removed.

    Every run of whitespace is squeezed to a single space, whitespace on
    either side of ``>`` and ``<`` is dropped, and the result is stripped
    of leading and trailing whitespace.
    """
    # Applied in order; each entry is (regex pattern, replacement text).
    substitutions = (
        (r'\s+', ' '),      # squeeze whitespace runs to one space
        (r'\s*>\s*', '>'),  # drop whitespace around '>'
        (r'\s*<\s*', '<'),  # drop whitespace around '<'
    )
    minified = html
    for pattern, replacement in substitutions:
        minified = re.sub(pattern, replacement, minified)
    return minified.strip()
\ No newline at end of file
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
0 → 100644
View file @
bed386f7
magic_pdf/model/
ppTableModel
.py
→
magic_pdf/model/
sub_modules/table/tablemaster/tablemaster_paddle
.py
View file @
bed386f7
...
...
@@ -7,7 +7,7 @@ from PIL import Image
import
numpy
as
np
class
pp
TableModel
(
object
):
class
TableM
asterPaddleM
odel
(
object
):
"""
This class is responsible for converting image of table into HTML format using a pre-trained model.
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
bed386f7
...
...
@@ -164,8 +164,8 @@ class ModelSingleton:
def
do_predict
(
boxes
:
List
[
List
[
int
]],
model
)
->
List
[
int
]:
from
magic_pdf.model.
v3
.helpers
import
(
boxes2inputs
,
parse_logits
,
prepare_inputs
)
from
magic_pdf.model.
sub_modules.reading_oreder.layoutreader
.helpers
import
(
boxes2inputs
,
parse_logits
,
prepare_inputs
)
inputs
=
boxes2inputs
(
boxes
)
inputs
=
prepare_inputs
(
inputs
,
model
)
...
...
@@ -206,7 +206,7 @@ def cal_block_index(fix_blocks, sorted_bboxes):
del
block
[
'real_lines'
]
import
numpy
as
np
from
magic_pdf.model.
v3
.xycut
import
recursive_xy_cut
from
magic_pdf.model.
sub_modules.reading_oreder.layoutreader
.xycut
import
recursive_xy_cut
random_boxes
=
np
.
array
(
block_bboxes
)
np
.
random
.
shuffle
(
random_boxes
)
...
...
setup.py
View file @
bed386f7
...
...
@@ -49,6 +49,7 @@ if __name__ == '__main__':
"doclayout_yolo==0.0.2"
,
# doclayout_yolo
"rapidocr-paddle"
,
# rapidocr-paddle
"rapid_table"
,
# rapid_table
"PyYAML"
,
# yaml
"detectron2"
],
},
...
...
tests/test_table/test_tablemaster.py
View file @
bed386f7
...
...
@@ -2,7 +2,7 @@ import unittest
from
PIL
import
Image
from
lxml
import
etree
from
magic_pdf.model.
ppTableModel
import
pp
TableModel
from
magic_pdf.model.
sub_modules.table.tablemaster.tablemaster_paddle
import
TableM
asterPaddleM
odel
class
TestppTableModel
(
unittest
.
TestCase
):
...
...
@@ -11,7 +11,7 @@ class TestppTableModel(unittest.TestCase):
# Change the table model path
config
=
{
"device"
:
"cuda"
,
"model_dir"
:
"/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"
}
table_model
=
pp
TableModel
(
config
)
table_model
=
TableM
asterPaddleM
odel
(
config
)
res
=
table_model
.
img2html
(
img
)
# Verify that the generated HTML matches expectations
parser
=
etree
.
HTMLParser
()
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment