Unverified Commit 034034c6 authored by sayThQ199's avatar sayThQ199 Committed by GitHub
Browse files

Update ext.py

Make the PDF check work for file names that contain special characters or Chinese characters (including URL-encoded names and names without a .pdf extension).
parent 97b19eb3
import hashlib
import mimetypes
import urllib.parse
def _has_pdf_header(file):
    """Return True if *file* starts with the PDF magic bytes ``%PDF-``.

    The stream is rewound to offset 0 after the header is read so that the
    caller can go on to consume the whole file. Any failure while reading or
    seeking (for example a closed stream, or a text-mode file whose ``read``
    returns ``str``) is reported and treated as "not a PDF" instead of being
    raised.
    """
    try:
        file_start = file.read(5)
        file.seek(0)  # reset the file pointer for later readers
        return file_start.startswith(b'%PDF-')
    except Exception as e:
        print(f"Error checking PDF format: {str(e)}")
        return False


def is_pdf(filename, file):
    """
    Check whether a file is a PDF; Chinese names and special characters are supported.

    The name is URL-decoded first and its MIME type is guessed from the
    decoded name. A name that maps to a non-PDF MIME type is rejected
    outright. Otherwise (PDF MIME type, unknown MIME type, or a name that
    could not be analysed at all) the decision is made from the file content:
    it must start with ``%PDF-``.

    :param filename: file name or URL, possibly percent-encoded
    :param file: file object opened in binary mode, supporting read() and seek()
    :return: True if the file is a PDF, otherwise False
    """
    # The ".pdf" extension is deliberately not required: PDF links such as
    # https://arxiv.org/pdf/2405.08702 may come without any extension.
    try:
        # URL-decode the name so percent-encoded special characters do not
        # hide the real extension from the MIME lookup.
        decoded_filename = urllib.parse.unquote(filename)
        mime_type, _ = mimetypes.guess_type(decoded_filename)
        print(f"Detected MIME type: {mime_type}")
    except Exception as e:
        print(f"Error checking PDF format: {str(e)}")
        # The name could not be analysed; fall back to the content check.
        mime_type = None

    # A known, non-PDF MIME type is conclusive. An unknown type (None) is
    # not, so in that case only the file header decides.
    if mime_type is not None and mime_type != 'application/pdf':
        return False

    return _has_pdf_header(file)
def url_is_pdf(file):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment