Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
82a4376d
Commit
82a4376d
authored
Apr 14, 2025
by
Doge2077
Browse files
bugfix: Chinese fonts were ignored when converting files to PDF.
parent
47d287a2
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
81 additions
and
5 deletions
+81
-5
docker/china/Dockerfile
docker/china/Dockerfile
+10
-0
docker/global/Dockerfile
docker/global/Dockerfile
+10
-0
magic_pdf/utils/office_to_pdf.py
magic_pdf/utils/office_to_pdf.py
+61
-5
No files found.
docker/china/Dockerfile
View file @
82a4376d
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
wget
\
git
\
git
\
libgl1
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
# Set Python 3.10 as the default python3
...
...
docker/global/Dockerfile
View file @
82a4376d
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
...
@@ -18,7 +18,17 @@ RUN apt-get update && \
wget
\
wget
\
git
\
git
\
libgl1
\
libgl1
\
libreoffice
\
fonts-noto-cjk
\
fonts-wqy-zenhei
\
fonts-wqy-microhei
\
ttf-mscorefonts-installer
\
fontconfig
\
libglib2.0-0
\
libglib2.0-0
\
libxrender1
\
libsm6
\
libxext6
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
&&
rm
-rf
/var/lib/apt/lists/
*
# Set Python 3.10 as the default python3
# Set Python 3.10 as the default python3
...
...
magic_pdf/utils/office_to_pdf.py
View file @
82a4376d
import
os
import
os
import
subprocess
import
subprocess
import
platform
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -9,15 +10,70 @@ class ConvertToPdfError(Exception):
...
@@ -9,15 +10,70 @@ class ConvertToPdfError(Exception):
super
().
__init__
(
self
.
msg
)
super
().
__init__
(
self
.
msg
)
# Chinese font families; at least one of them must be available, otherwise
# the document-to-PDF conversion silently drops Chinese text.
REQUIRED_CHS_FONTS = ['SimSun', 'Microsoft YaHei', 'Noto Sans CJK SC']

# Font file extensions to consider on Windows. SimSun and Microsoft YaHei are
# shipped as TrueType collections (.ttc), so looking at *.ttf alone misses them.
_FONT_FILE_SUFFIXES = ('.ttf', '.ttc', '.otf')

# File-name fragments under which the families above are commonly installed on
# Windows (simsun.ttc, msyh.ttc / msyh.ttf, NotoSansCJK*.otf / *.ttc).
# Windows font files are named after short stems, not after the family name.
_WINDOWS_CHS_FONT_FILE_STEMS = ('simsun', 'msyh', 'notosanscjk')


def check_fonts_installed():
    """Check if required Chinese fonts are installed.

    On Windows the system font folder is scanned for font files whose name
    matches one of the required families. On every other platform the
    output of ``fc-list :lang=zh`` is searched for the family names.

    Returns:
        True when at least one font from REQUIRED_CHS_FONTS is found.

    Raises:
        EnvironmentError: "Missing Chinese font..." when detection worked but
            none of the required fonts is present; "Font detection failed..."
            when ``fc-list`` could not be run or its output could not be read.
    """
    missing_msg = (
        "Missing Chinese font. Please install at least one of: "
        f"{', '.join(REQUIRED_CHS_FONTS)}"
    )

    if platform.system() == 'Windows':
        # Honour a non-default system drive instead of assuming C:.
        font_dir = Path(os.environ.get('WINDIR', 'C:/Windows')) / 'Fonts'
        installed_fonts = [
            f.name.lower()
            for f in font_dir.glob('*')
            if f.suffix.lower() in _FONT_FILE_SUFFIXES
        ]
        # Accept the family name itself (with or without spaces) as well as
        # the known file stems; compare case-insensitively because Windows
        # file names are not case-normalised.
        wanted = [font.lower() for font in REQUIRED_CHS_FONTS]
        wanted += [font.lower().replace(' ', '') for font in REQUIRED_CHS_FONTS]
        wanted += list(_WINDOWS_CHS_FONT_FILE_STEMS)
        if any(key in name for name in installed_fonts for key in wanted):
            return True
        raise EnvironmentError(missing_msg)

    # Linux/macOS: use fc-list. Only the subprocess call is guarded, so that a
    # genuine "font missing" result below is not re-reported as a detection
    # failure (EnvironmentError is an alias of OSError and would otherwise be
    # swallowed by a broad handler).
    try:
        output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        raise EnvironmentError(
            f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}"
        ) from e

    if any(font in output for font in REQUIRED_CHS_FONTS):
        return True
    raise EnvironmentError(missing_msg)
def get_soffice_command():
    """Return the path to LibreOffice's soffice executable depending on the platform.

    Windows: the default install locations are probed first, then the
    directories named by the ProgramFiles environment variables, then PATH.
    macOS: when ``soffice`` is not on PATH, the standard application bundle
    is used if it exists.
    Everything else: the bare command name is returned and resolved via PATH
    when the command is executed.

    Returns:
        str: an absolute path to the executable, or ``'soffice'``.

    Raises:
        ConvertToPdfError: on Windows, when no soffice executable is found.
    """
    import shutil  # local import: only this function needs the PATH lookup

    system_type = platform.system()

    if system_type == 'Windows':
        possible_paths = [
            Path("C:/Program Files/LibreOffice/program/soffice.exe"),
            Path("C:/Program Files (x86)/LibreOffice/program/soffice.exe"),
        ]
        # Installs on a non-C: system drive are found via the environment.
        for var in ('PROGRAMFILES', 'PROGRAMFILES(X86)'):
            root = os.environ.get(var)
            if root:
                possible_paths.append(Path(root) / 'LibreOffice' / 'program' / 'soffice.exe')
        for path in possible_paths:
            if path.exists():
                return str(path)
        # Last resort: a custom install whose program directory is on PATH.
        on_path = shutil.which('soffice')
        if on_path:
            return on_path
        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice and ensure soffice.exe is located in a standard path."
        )

    if system_type == 'Darwin' and shutil.which('soffice') is None:
        # The macOS app bundle does not add soffice to PATH by itself.
        bundled = Path('/Applications/LibreOffice.app/Contents/MacOS/soffice')
        if bundled.exists():
            return str(bundled)

    return 'soffice'  # Assume it's in PATH on Linux/macOS
def
convert_file_to_pdf
(
input_path
,
output_dir
):
def
convert_file_to_pdf
(
input_path
,
output_dir
):
"""Convert a single document (ppt, doc, etc.) to PDF."""
if
not
os
.
path
.
isfile
(
input_path
):
if
not
os
.
path
.
isfile
(
input_path
):
raise
FileNotFoundError
(
f
"The input file
{
input_path
}
does not exist."
)
raise
FileNotFoundError
(
f
"The input file
{
input_path
}
does not exist."
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
check_fonts_installed
()
soffice_cmd
=
get_soffice_command
()
cmd
=
[
cmd
=
[
'
soffice
'
,
soffice
_cmd
,
'--headless'
,
'--headless'
,
'--norestore'
,
'--invisible'
,
'--convert-to'
,
'pdf'
,
'--convert-to'
,
'pdf'
,
'--outdir'
,
str
(
output_dir
),
'--outdir'
,
str
(
output_dir
),
str
(
input_path
)
str
(
input_path
)
...
@@ -26,4 +82,4 @@ def convert_file_to_pdf(input_path, output_dir):
...
@@ -26,4 +82,4 @@ def convert_file_to_pdf(input_path, output_dir):
process
=
subprocess
.
run
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
process
=
subprocess
.
run
(
cmd
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
if
process
.
returncode
!=
0
:
if
process
.
returncode
!=
0
:
raise
ConvertToPdfError
(
process
.
stderr
.
decode
())
raise
ConvertToPdfError
(
f
"LibreOffice convert failed:
{
process
.
stderr
.
decode
()
}
"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment