Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7f0371da
Commit
7f0371da
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
11e4e8cc
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
12 additions
and
12 deletions
+12
-12
othoers/check_inline_formula.py
othoers/check_inline_formula.py
+0
-0
othoers/pdf2json_infer.py
othoers/pdf2json_infer.py
+2
-2
othoers/pdf2text_evaluatePdfLayout.py
othoers/pdf2text_evaluatePdfLayout.py
+0
-0
othoers/pdf2text_getNumberOfColumn.py
othoers/pdf2text_getNumberOfColumn.py
+0
-0
othoers/pdf2text_recogFootnoteLine.py
othoers/pdf2text_recogFootnoteLine.py
+0
-0
othoers/pdf2text_recogPara_v2.py
othoers/pdf2text_recogPara_v2.py
+1
-1
othoers/pdf2text_recogTitle.py
othoers/pdf2text_recogTitle.py
+0
-0
othoers/vali_bbox_sort.py
othoers/vali_bbox_sort.py
+0
-0
othoers/validation.py
othoers/validation.py
+0
-0
pipeline/pdf_parse_by_model.py
pipeline/pdf_parse_by_model.py
+4
-4
post_proc/detect_para.py
post_proc/detect_para.py
+1
-1
pre_proc/detect_header.py
pre_proc/detect_header.py
+0
-0
pre_proc/detect_page_number.py
pre_proc/detect_page_number.py
+0
-0
pre_proc/detect_tables.py
pre_proc/detect_tables.py
+0
-0
test/test_para/test_pdf2text_recogPara_BlockContinuationProcessor.py
...ara/test_pdf2text_recogPara_BlockContinuationProcessor.py
+1
-1
test/test_para/test_pdf2text_recogPara_BlockInnerParasProcessor.py
..._para/test_pdf2text_recogPara_BlockInnerParasProcessor.py
+1
-1
test/test_para/test_pdf2text_recogPara_Common.py
test/test_para/test_pdf2text_recogPara_Common.py
+1
-1
test/test_para/test_pdf2text_recogPara_TitleProcessor.py
test/test_para/test_pdf2text_recogPara_TitleProcessor.py
+1
-1
No files found.
check_inline_formula.py → othoers/check_inline_formula.py
View file @
7f0371da
File moved
pdf2json_infer.py → othoers/pdf2json_infer.py
View file @
7f0371da
...
...
@@ -24,10 +24,10 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
 from layout.bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
 from pre_proc.detect_images import parse_images  # 获取figures的bbox
-from pdf2text_recogTable import parse_tables  # 获取tables的bbox
+from pre_proc.detect_tables import parse_tables  # 获取tables的bbox
 from pre_proc.detect_equation import parse_equations  # 获取equations的bbox
 # from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
-from pdf2text_recogPara import process_blocks_per_page
+from post_proc.detect_para import process_blocks_per_page
 from libs.commons import parse_aws_param, parse_bucket_key, read_file, join_path
...
...
pdf2text_evaluatePdfLayout.py → othoers/pdf2text_evaluatePdfLayout.py
View file @
7f0371da
File moved
pdf2text_getNumberOfColumn.py → othoers/pdf2text_getNumberOfColumn.py
View file @
7f0371da
File moved
pdf2text_recogFootnoteLine.py → othoers/pdf2text_recogFootnoteLine.py
View file @
7f0371da
File moved
pdf2text_recogPara_v2.py → othoers/pdf2text_recogPara_v2.py
View file @
7f0371da
...
...
@@ -3469,7 +3469,7 @@ class ParaProcessPipeline:
 """
 Run this script to test the function with Command:
-python pdf2text_recogPara.py [pdf_path] [output_pdf_path]
+python detect_para.py [pdf_path] [output_pdf_path]
 Params:
 - pdf_path: the path of the pdf file
...
...
pdf2text_recogTitle.py → othoers/pdf2text_recogTitle.py
View file @
7f0371da
File moved
vali_bbox_sort.py → othoers/vali_bbox_sort.py
View file @
7f0371da
File moved
validation.py → othoers/validation.py
View file @
7f0371da
File moved
pipeline/pdf_parse_by_model.py
View file @
7f0371da
...
...
@@ -16,14 +16,14 @@ from libs.markdown_utils import escape_special_markdown_char
 from libs.safe_filename import sanitize_filename
 from libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
 from pre_proc.detect_images import parse_images
-from pdf2text_recogTable import parse_tables  # 获取tables的bbox
+from pre_proc.detect_tables import parse_tables  # 获取tables的bbox
 from pre_proc.detect_equation import parse_equations  # 获取equations的bbox
-from pdf2text_recogHeader import parse_headers  # 获取headers的bbox
-from pdf2text_recogPageNo import parse_pageNos  # 获取pageNos的bbox
+from pre_proc.detect_header import parse_headers  # 获取headers的bbox
+from pre_proc.detect_page_number import parse_pageNos  # 获取pageNos的bbox
 from pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule  # 获取footnotes的bbox
 from pre_proc.detect_footer_by_model import parse_footers  # 获取footers的bbox
-from pdf2text_recogPara import (
+from post_proc.detect_para import (
     ParaProcessPipeline,
     TitleDetectionException,
     TitleLevelException,
...
...
pdf2text_recogPara.py → post_proc/detect_para.py
View file @
7f0371da
...
...
@@ -3395,7 +3395,7 @@ class ParaProcessPipeline:
 """
 Run this script to test the function with Command:
-python pdf2text_recogPara.py [pdf_path] [output_pdf_path]
+python detect_para.py [pdf_path] [output_pdf_path]
 Params:
 - pdf_path: the path of the pdf file
...
...
pdf2text_recogHeader.py → pre_proc/detect_header.py
View file @
7f0371da
File moved
pdf2text_recogPageNo.py → pre_proc/detect_page_number.py
View file @
7f0371da
File moved
pdf2text_recogTable.py → pre_proc/detect_tables.py
View file @
7f0371da
File moved
test/test_para/test_pdf2text_recogPara_BlockContinuationProcessor.py
View file @
7f0371da
 import unittest
-from pdf2text_recogPara import BlockContinuationProcessor
+from post_proc.detect_para import BlockContinuationProcessor
 # from ... pdf2text_recogPara import BlockContinuationProcessor # another way to import
...
...
test/test_para/test_pdf2text_recogPara_BlockInnerParasProcessor.py
View file @
7f0371da
 import unittest
-from pdf2text_recogPara import BlockTerminationProcessor
+from post_proc.detect_para import BlockTerminationProcessor
 # from ... pdf2text_recogPara import BlockInnerParasProcessor # another way to import
...
...
test/test_para/test_pdf2text_recogPara_Common.py
View file @
7f0371da
 import unittest
-from pdf2text_recogPara import (
+from post_proc.detect_para import (
     is_bbox_overlap,
     is_in_bbox,
     is_line_right_aligned_from_neighbors,
...
...
test/test_para/test_pdf2text_recogPara_TitleProcessor.py
View file @
7f0371da
...
...
@@ -2,7 +2,7 @@ import json
 import unittest
 from utils_for_test_para import UtilsForTestPara
-from pdf2text_recogPara import TitleProcessor
+from post_proc.detect_para import TitleProcessor
 # from ... pdf2text_recogPara import * # another way to import
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment