Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
08f46125
Commit
08f46125
authored
Nov 15, 2024
by
myhloli
Browse files
refactor(model): rename and restructure model modules
parent
918ed65b
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
34 additions
and
16 deletions
+34
-16
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py
...model/sub_modules/reading_oreder/layoutreader/__init__.py
+0
-0
magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py
.../model/sub_modules/reading_oreder/layoutreader/helpers.py
+0
-0
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py
...df/model/sub_modules/reading_oreder/layoutreader/xycut.py
+0
-0
magic_pdf/model/sub_modules/table/__init__.py
magic_pdf/model/sub_modules/table/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
+14
-0
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py
...f/model/sub_modules/table/structeqtable/struct_eqtable.py
+3
-11
magic_pdf/model/sub_modules/table/table_utils.py
magic_pdf/model/sub_modules/table/table_utils.py
+11
-0
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
+0
-0
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py
...model/sub_modules/table/tablemaster/tablemaster_paddle.py
+1
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+3
-3
setup.py
setup.py
+1
-0
tests/test_table/test_tablemaster.py
tests/test_table/test_tablemaster.py
+1
-1
No files found.
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py
0 → 100644
View file @
08f46125
magic_pdf/model/
v3
/helpers.py
→
magic_pdf/model/
sub_modules/reading_oreder/layoutreader
/helpers.py
View file @
08f46125
File moved
magic_pdf/model/
v3
/xycut.py
→
magic_pdf/model/
sub_modules/reading_oreder/layoutreader
/xycut.py
View file @
08f46125
File moved
magic_pdf/model/sub_modules/table/__init__.py
0 → 100644
View file @
08f46125
magic_pdf/model/sub_modules/table/rapidtable/__init__.py
0 → 100644
View file @
08f46125
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py
0 → 100644
View file @
08f46125
import
numpy
as
np
from
rapid_table
import
RapidTable
from
rapidocr_paddle
import
RapidOCR
class RapidTableModel(object):
    """Table recognizer that pairs a RapidOCR engine with a RapidTable model.

    OCR is run on the table image first, and its result is handed to the
    table model, which produces the HTML for the table.
    """

    def __init__(self):
        """Create the table model and the OCR engine.

        All three OCR stages (det / cls / rec) are asked to use CUDA.
        NOTE(review): this is hard-coded, so it presumably requires a GPU
        build of the OCR backend -- confirm behavior on CPU-only hosts.
        """
        self.table_model = RapidTable()
        self.ocr_engine = RapidOCR(
            det_use_cuda=True,
            cls_use_cuda=True,
            rec_use_cuda=True,
        )

    def predict(self, image):
        """Recognize the table contained in ``image``.

        Args:
            image: Any object accepted by ``np.asarray``.

        Returns:
            A ``(html_code, table_cell_bboxes, elapse)`` tuple as produced
            by the table model, or ``(None, None, None)`` when the OCR
            engine returned no result for the image.
        """
        # Convert once and reuse the same array for both models.
        image_array = np.asarray(image)

        ocr_result, _ = self.ocr_engine(image_array)
        if ocr_result is None:
            # The OCR engine found nothing to hand over (RapidOCR is
            # understood to return None when no text is detected -- confirm
            # for the installed version). Passing None on would make the
            # table model fail, so report "no table" in the usual 3-tuple.
            return None, None, None

        html_code, table_cell_bboxes, elapse = self.table_model(
            image_array, ocr_result
        )
        return html_code, table_cell_bboxes, elapse
\ No newline at end of file
magic_pdf/model/sub_modules/table/structeqtable/__init__.py
0 → 100644
View file @
08f46125
magic_pdf/model/
pek_
sub_modules/structeqtable/
S
truct
TableModel
.py
→
magic_pdf/model/sub_modules/
table/
structeqtable/
s
truct
_eqtable
.py
View file @
08f46125
import
re
import
torch
import
torch
from
struct_eqtable
import
build_model
from
struct_eqtable
import
build_model
from
magic_pdf.model.sub_modules.table.table_utils
import
minify_html
class
StructTableModel
:
class
StructTableModel
:
def
__init__
(
self
,
model_path
,
max_new_tokens
=
1024
,
max_time
=
60
):
def
__init__
(
self
,
model_path
,
max_new_tokens
=
1024
,
max_time
=
60
):
...
@@ -31,15 +31,7 @@ class StructTableModel:
...
@@ -31,15 +31,7 @@ class StructTableModel:
)
)
if
output_format
==
"html"
:
if
output_format
==
"html"
:
results
=
[
self
.
minify_html
(
html
)
for
html
in
results
]
results
=
[
minify_html
(
html
)
for
html
in
results
]
return
results
return
results
def
minify_html
(
self
,
html
):
# 移除多余的空白字符
html
=
re
.
sub
(
r
'\s+'
,
' '
,
html
)
# 移除行尾的空白字符
html
=
re
.
sub
(
r
'\s*>\s*'
,
'>'
,
html
)
# 移除标签前的空白字符
html
=
re
.
sub
(
r
'\s*<\s*'
,
'<'
,
html
)
return
html
.
strip
()
\ No newline at end of file
magic_pdf/model/sub_modules/table/table_utils.py
0 → 100644
View file @
08f46125
import
re
def minify_html(html):
    """Return ``html`` with whitespace collapsed and trimmed around tags.

    Three regex substitutions are applied in order, then the result is
    stripped of leading and trailing whitespace. Note that whitespace
    between text and an adjacent ``<`` or ``>`` is removed as well.

    Args:
        html: The HTML string to compact.

    Returns:
        The compacted HTML string.
    """
    substitutions = (
        # Collapse every run of whitespace into a single space.
        (r'\s+', ' '),
        # Remove whitespace on either side of a closing angle bracket.
        (r'\s*>\s*', '>'),
        # Remove whitespace on either side of an opening angle bracket.
        (r'\s*<\s*', '<'),
    )
    compact = html
    for pattern, replacement in substitutions:
        compact = re.sub(pattern, replacement, compact)
    return compact.strip()
\ No newline at end of file
magic_pdf/model/sub_modules/table/tablemaster/__init__.py
0 → 100644
View file @
08f46125
magic_pdf/model/
ppTableModel
.py
→
magic_pdf/model/
sub_modules/table/tablemaster/tablemaster_paddle
.py
View file @
08f46125
...
@@ -7,7 +7,7 @@ from PIL import Image
...
@@ -7,7 +7,7 @@ from PIL import Image
import
numpy
as
np
import
numpy
as
np
class
pp
TableModel
(
object
):
class
TableM
asterPaddleM
odel
(
object
):
"""
"""
This class is responsible for converting image of table into HTML format using a pre-trained model.
This class is responsible for converting image of table into HTML format using a pre-trained model.
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
08f46125
...
@@ -164,8 +164,8 @@ class ModelSingleton:
...
@@ -164,8 +164,8 @@ class ModelSingleton:
def
do_predict
(
boxes
:
List
[
List
[
int
]],
model
)
->
List
[
int
]:
def
do_predict
(
boxes
:
List
[
List
[
int
]],
model
)
->
List
[
int
]:
from
magic_pdf.model.
v3
.helpers
import
(
boxes2inputs
,
parse_logits
,
from
magic_pdf.model.
sub_modules.reading_oreder.layoutreader
.helpers
import
(
boxes2inputs
,
parse_logits
,
prepare_inputs
)
prepare_inputs
)
inputs
=
boxes2inputs
(
boxes
)
inputs
=
boxes2inputs
(
boxes
)
inputs
=
prepare_inputs
(
inputs
,
model
)
inputs
=
prepare_inputs
(
inputs
,
model
)
...
@@ -206,7 +206,7 @@ def cal_block_index(fix_blocks, sorted_bboxes):
...
@@ -206,7 +206,7 @@ def cal_block_index(fix_blocks, sorted_bboxes):
del
block
[
'real_lines'
]
del
block
[
'real_lines'
]
import
numpy
as
np
import
numpy
as
np
from
magic_pdf.model.
v3
.xycut
import
recursive_xy_cut
from
magic_pdf.model.
sub_modules.reading_oreder.layoutreader
.xycut
import
recursive_xy_cut
random_boxes
=
np
.
array
(
block_bboxes
)
random_boxes
=
np
.
array
(
block_bboxes
)
np
.
random
.
shuffle
(
random_boxes
)
np
.
random
.
shuffle
(
random_boxes
)
...
...
setup.py
View file @
08f46125
...
@@ -49,6 +49,7 @@ if __name__ == '__main__':
...
@@ -49,6 +49,7 @@ if __name__ == '__main__':
"doclayout_yolo==0.0.2"
,
# doclayout_yolo
"doclayout_yolo==0.0.2"
,
# doclayout_yolo
"rapidocr-paddle"
,
# rapidocr-paddle
"rapidocr-paddle"
,
# rapidocr-paddle
"rapid_table"
,
# rapid_table
"rapid_table"
,
# rapid_table
"PyYAML"
,
# yaml
"detectron2"
"detectron2"
],
],
},
},
...
...
tests/test_table/test_tablemaster.py
View file @
08f46125
...
@@ -2,7 +2,7 @@ import unittest
...
@@ -2,7 +2,7 @@ import unittest
from
PIL
import
Image
from
PIL
import
Image
from
lxml
import
etree
from
lxml
import
etree
from
magic_pdf.model.
ppTableModel
import
ppTableModel
from
magic_pdf.model.
sub_modules.table.tablemaster.tablemaster_paddle
import
ppTableModel
class
TestppTableModel
(
unittest
.
TestCase
):
class
TestppTableModel
(
unittest
.
TestCase
):
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment