Unverified Commit 24b7e7ca authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2226 from Doge2077/master

fix: Chinese Character Garbling in PPTX/DOCX Conversion by Adding Font Check and Installation
parents ff35c755 87440ba4
...@@ -18,7 +18,17 @@ RUN apt-get update && \ ...@@ -18,7 +18,17 @@ RUN apt-get update && \
wget \ wget \
git \ git \
libgl1 \ libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \ libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3 # Set Python 3.10 as the default python3
......
...@@ -18,7 +18,17 @@ RUN apt-get update && \ ...@@ -18,7 +18,17 @@ RUN apt-get update && \
wget \ wget \
git \ git \
libgl1 \ libgl1 \
libreoffice \
fonts-noto-cjk \
fonts-wqy-zenhei \
fonts-wqy-microhei \
ttf-mscorefonts-installer \
fontconfig \
libglib2.0-0 \ libglib2.0-0 \
libxrender1 \
libsm6 \
libxext6 \
poppler-utils \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Set Python 3.10 as the default python3 # Set Python 3.10 as the default python3
......
import os import os
import subprocess import subprocess
import platform
from pathlib import Path from pathlib import Path
import shutil
class ConvertToPdfError(Exception): class ConvertToPdfError(Exception):
...@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception): ...@@ -9,21 +11,114 @@ class ConvertToPdfError(Exception):
super().__init__(self.msg) super().__init__(self.msg)
# Font family names that can render Chinese text; check_fonts_installed()
# succeeds as soon as any one of them is found on the system.
REQUIRED_CHS_FONTS = [
    'SimSun',
    'Microsoft YaHei',
    'Noto Sans CJK SC',
]
def check_fonts_installed():
    """Check that at least one of the required Chinese fonts is installed.

    On Windows the system (and per-user) font folders are scanned for the font
    files that provide the families listed in ``REQUIRED_CHS_FONTS``. On every
    other platform ``fc-list :lang=zh`` is run and its output is searched for
    the family names.

    Returns:
        True when at least one required font is found.

    Raises:
        EnvironmentError: if none of the required fonts is found, or if
            ``fc-list`` cannot be run (for example fontconfig is missing).
    """
    missing_fonts_msg = (
        f"Missing Chinese font. Please install at least one of: {', '.join(REQUIRED_CHS_FONTS)}"
    )

    if platform.system() == 'Windows':
        # Font files are not named after their family ("Microsoft YaHei" ships
        # as msyh.ttc, "SimSun" as simsun.ttc), so map each known family to the
        # lower-case prefix of its file name. The trailing dot keeps
        # "simsun." from matching simsunb.ttf (SimSun-ExtB).
        file_prefixes = {
            'SimSun': 'simsun.',
            'Microsoft YaHei': 'msyh.',
            'Noto Sans CJK SC': 'notosanscjk',
        }
        # Families without a known file name fall back to the family name with
        # spaces removed, compared case-insensitively.
        wanted = tuple(
            file_prefixes.get(font, font.replace(' ', '').lower())
            for font in REQUIRED_CHS_FONTS
        )

        font_dirs = [Path(os.environ.get('WINDIR', 'C:/Windows')) / 'Fonts']
        # Fonts installed "for the current user only" live under LOCALAPPDATA.
        local_app_data = os.environ.get('LOCALAPPDATA')
        if local_app_data:
            font_dirs.append(Path(local_app_data) / 'Microsoft/Windows/Fonts')

        for font_dir in font_dirs:
            if not font_dir.is_dir():
                continue
            for font_file in font_dir.iterdir():
                # CJK fonts are commonly .ttc collections or .otf, not only .ttf.
                if (font_file.suffix.lower() in ('.ttf', '.ttc', '.otf')
                        and font_file.name.lower().startswith(wanted)):
                    return True
        raise EnvironmentError(missing_fonts_msg)

    # Linux/macOS: ask fontconfig. Only the subprocess call is guarded, so a
    # genuine "font missing" result is not reported as a detection failure.
    try:
        output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        raise EnvironmentError(
            f"Font detection failed. Please install 'fontconfig' and fonts: {str(e)}"
        ) from e

    if any(font in output for font in REQUIRED_CHS_FONTS):
        return True
    raise EnvironmentError(missing_fonts_msg)
def get_soffice_command():
    """Return the path to LibreOffice's ``soffice`` executable.

    The executable is looked up on ``PATH`` first; if it is not there, a list
    of common per-platform installation locations is probed.

    Returns:
        The path of the first ``soffice`` executable found, as a string.

    Raises:
        ConvertToPdfError: if LibreOffice cannot be found; the message contains
            installation instructions for the current platform.
    """
    # A soffice on PATH always wins over the hard-coded locations below.
    on_path = shutil.which('soffice')
    if on_path:
        return on_path

    if platform.system() == 'Windows':
        # Default installer locations, honouring relocated Program Files dirs.
        candidates = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe'),
        ]
        # Installs placed directly at the root of a drive.
        candidates.extend(
            Path(f"{drive}/LibreOffice/program/soffice.exe")
            for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']
        )

        for candidate in candidates:
            if candidate.exists():
                return str(candidate)

        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )

    # Linux/macOS standard locations.
    candidates = [
        '/usr/bin/soffice',
        '/usr/local/bin/soffice',
        '/opt/libreoffice/program/soffice',
        '/Applications/LibreOffice.app/Contents/MacOS/soffice',
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate

    # Raised directly (not inside a try/except) so the installation
    # instructions reach the caller without being re-wrapped.
    raise ConvertToPdfError(
        "LibreOffice not found. Please install it:\n"
        "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
        "  - CentOS/RHEL: sudo yum install libreoffice\n"
        "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
        "  - Or ensure soffice is in your PATH environment variable."
    )
def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF.

    Runs LibreOffice headlessly with ``--convert-to pdf`` and ``--outdir``
    set to ``output_dir``.

    Args:
        input_path: Path of the document to convert; must be an existing file.
        output_dir: Directory passed to LibreOffice as ``--outdir``; created
            if it does not exist.

    Raises:
        FileNotFoundError: if ``input_path`` is not an existing file.
        EnvironmentError: if no required Chinese font is installed
            (raised by ``check_fonts_installed``).
        ConvertToPdfError: if LibreOffice cannot be located or exits with a
            non-zero status.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    # Fail early instead of producing a PDF with garbled Chinese text.
    check_fonts_installed()

    soffice_cmd = get_soffice_command()

    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # stderr may be in the console's local code page rather than UTF-8;
        # decode leniently so the real failure is not masked by a
        # UnicodeDecodeError.
        stderr_text = process.stderr.decode(errors='replace')
        raise ConvertToPdfError(f"LibreOffice convert failed: {stderr_text}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment