Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
8fb6794b
Unverified
Commit
8fb6794b
authored
Apr 17, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Apr 17, 2025
Browse files
Merge pull request #2265 from opendatalab/release-1.3.5
Release 1.3.5
parents
a2b07bfd
af53a463
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
24 deletions
+16
-24
docker/ascend_npu/Dockerfile
docker/ascend_npu/Dockerfile
+1
-1
magic_pdf/utils/office_to_pdf.py
magic_pdf/utils/office_to_pdf.py
+13
-22
tests/unittest/test_table/test_rapidtable.py
tests/unittest/test_table/test_rapidtable.py
+2
-1
No files found.
docker/ascend_npu/Dockerfile
View file @
8fb6794b
...
@@ -36,7 +36,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
...
@@ -36,7 +36,7 @@ RUN /bin/bash -c "wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/m
source /opt/mineru_venv/bin/activate &&
\
source /opt/mineru_venv/bin/activate &&
\
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple &&
\
pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple &&
\
pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple &&
\
pip3 install torch==2.3.1 torchvision==0.18.1 -i https://mirrors.aliyun.com/pypi/simple &&
\
pip3 install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple &&
\
pip3 install -U magic-pdf[full]
'numpy<2' decorator attrs absl-py cloudpickle ml-dtypes tornado einops
-i https://mirrors.aliyun.com/pypi/simple &&
\
wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl &&
\
wget https://gitee.com/ascend/pytorch/releases/download/v6.0.rc2-pytorch2.3.1/torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl &&
\
pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
pip3 install torch_npu-2.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl"
...
...
magic_pdf/utils/office_to_pdf.py
View file @
8fb6794b
...
@@ -4,6 +4,8 @@ import platform
...
@@ -4,6 +4,8 @@ import platform
from
pathlib
import
Path
from
pathlib
import
Path
import
shutil
import
shutil
from
loguru
import
logger
class
ConvertToPdfError
(
Exception
):
class
ConvertToPdfError
(
Exception
):
def
__init__
(
self
,
msg
):
def
__init__
(
self
,
msg
):
...
@@ -11,35 +13,24 @@ class ConvertToPdfError(Exception):
...
@@ -11,35 +13,24 @@ class ConvertToPdfError(Exception):
super
().
__init__
(
self
.
msg
)
super
().
__init__
(
self
.
msg
)
# Chinese font list
REQUIRED_CHS_FONTS
=
[
'SimSun'
,
'Microsoft YaHei'
,
'Noto Sans CJK SC'
]
def
check_fonts_installed
():
def
check_fonts_installed
():
"""Check if required Chinese fonts are installed."""
"""Check if required Chinese fonts are installed."""
system_type
=
platform
.
system
()
system_type
=
platform
.
system
()
if
system_type
==
'Windows'
:
if
system_type
in
[
'Windows'
,
'Darwin'
]:
# Windows: check fonts via registry or system font folder
pass
font_dir
=
Path
(
"C:/Windows/Fonts"
)
installed_fonts
=
[
f
.
name
for
f
in
font_dir
.
glob
(
"*.ttf"
)]
if
any
(
font
for
font
in
REQUIRED_CHS_FONTS
if
any
(
font
in
f
for
f
in
installed_fonts
)):
return
True
raise
EnvironmentError
(
f
"Missing Chinese font. Please install at least one of:
{
', '
.
join
(
REQUIRED_CHS_FONTS
)
}
"
)
else
:
else
:
# Linux
/macOS
: use fc-list
# Linux: use fc-list
try
:
try
:
output
=
subprocess
.
check_output
([
'fc-list'
,
':lang=zh'
],
encoding
=
'utf-8'
)
output
=
subprocess
.
check_output
([
'fc-list'
,
':lang=zh'
],
encoding
=
'utf-8'
)
for
font
in
REQUIRED_CHS_FONTS
:
if
output
.
strip
():
# 只要有任何输出(非空)
if
font
in
output
:
return
True
return
True
else
:
raise
EnvironmentError
(
logger
.
warning
(
f
"Missing
Chinese font
. Please install at least one of:
{
', '
.
join
(
REQUIRED_CHS_FONTS
)
}
"
f
"No
Chinese font
s were detected, the converted document may not display Chinese content properly.
"
)
)
except
Exception
as
e
:
except
Exception
:
raise
EnvironmentError
(
f
"Font detection failed. Please install 'fontconfig' and fonts:
{
str
(
e
)
}
"
)
pass
def
get_soffice_command
():
def
get_soffice_command
():
...
...
tests/unittest/test_table/test_rapidtable.py
View file @
8fb6794b
import
unittest
import
unittest
import
os
from
PIL
import
Image
from
PIL
import
Image
from
lxml
import
etree
from
lxml
import
etree
...
@@ -8,7 +9,7 @@ from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableM
...
@@ -8,7 +9,7 @@ from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableM
class
TestppTableModel
(
unittest
.
TestCase
):
class
TestppTableModel
(
unittest
.
TestCase
):
def
test_image2html
(
self
):
def
test_image2html
(
self
):
img
=
Image
.
open
(
"assets/table.jpg"
)
img
=
Image
.
open
(
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"assets/table.jpg"
)
)
atom_model_manager
=
AtomModelSingleton
()
atom_model_manager
=
AtomModelSingleton
()
ocr_engine
=
atom_model_manager
.
get_atom_model
(
ocr_engine
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
'ocr'
,
atom_model_name
=
'ocr'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment