"vscode:/vscode.git/clone" did not exist on "ca9e656a5c7957fa4d737fd6ae5494ccd8386170"
Unverified Commit 08dd3a85 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1743 from xuzijie1995/master

Update the is_pdf function in ext.py to support PDF filenames containing Chinese and other special characters
parents 8dd01346 034034c6
......@@ -123,4 +123,5 @@ jobs:
- name: Publish distribution to PyPI
run: |
pip install twine
twine check dist/*
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
import hashlib
import mimetypes
import urllib.parse
def _has_pdf_signature(file):
    """Return True when *file* starts with the ``%PDF-`` magic bytes.

    The stream is rewound to offset 0 before the read, so a handle that was
    already partially consumed is still checked at its real start, and again
    afterwards so the caller can read the whole file from the beginning.
    Any failure while seeking or reading (missing handle, closed stream,
    text-mode stream returning ``str``, ...) is reported and treated as
    "not a PDF" instead of propagating.
    """
    try:
        file.seek(0)
        header = file.read(5)
        file.seek(0)  # leave the pointer at the start for the caller
        return header.startswith(b'%PDF-')
    except Exception as exc:
        print(f"Error checking PDF format: {exc}")
        return False


def is_pdf(filename, file):
    """
    Decide whether an uploaded file is a PDF; Chinese and other special
    characters in the file name are supported.

    The name is URL-decoded first and its MIME type is guessed from the
    extension. A name whose type cannot be guessed (for example a link
    without an extension such as https://arxiv.org/pdf/2405.08702) is not
    rejected; the decision then rests on the file content alone.

    :param filename: file name, possibly percent-encoded
    :param file: binary file object supporting ``read`` and ``seek``
    :return: True if the file is a PDF, otherwise False
    """
    try:
        # Percent-decode so names like "%E6%96%87%E6%A1%A3.pdf" are judged
        # by their real extension.
        decoded_filename = urllib.parse.unquote(filename)
        mime_type, _ = mimetypes.guess_type(decoded_filename)
    except Exception as exc:
        # An unusable file name must not block the content check below.
        print(f"Error checking PDF format: {exc}")
        mime_type = None
    else:
        # NOTE(review): diagnostic print kept from the original; consider
        # routing it through the project's logger.
        print(f"Detected MIME type: {mime_type}")

    # A known, non-PDF type is a definite "no"; an unknown type (None)
    # falls through to the signature check.
    if mime_type is not None and mime_type != 'application/pdf':
        return False

    return _has_pdf_signature(file)
def url_is_pdf(file):
......
......@@ -143,6 +143,14 @@
"created_at": "2025-01-20T05:30:38Z",
"repoId": 765083837,
"pullRequestNo": 1578
},
{
"name": "shniubobo",
"id": 6594544,
"comment_id": 2660086464,
"created_at": "2025-02-14T19:15:25Z",
"repoId": 765083837,
"pullRequestNo": 1693
}
]
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment