"...text-generation-inference.git" did not exist on "a5593ba83ef6d2edd3406497e3ed0573a86e44b6"
Unverified Commit 08dd3a85 authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1743 from xuzijie1995/master

Update the is_pdf function in ext.py to support PDF filenames containing Chinese characters and special characters
parents 8dd01346 034034c6
...@@ -123,4 +123,5 @@ jobs: ...@@ -123,4 +123,5 @@ jobs:
- name: Publish distribution to PyPI - name: Publish distribution to PyPI
run: | run: |
pip install twine pip install twine
twine check dist/*
twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
import hashlib import hashlib
import mimetypes import mimetypes
import urllib.parse
def is_pdf(filename, file): def is_pdf(filename, file):
""" """
判断文件是否为PDF格式。 判断文件是否为PDF格式,支持中文名和特殊字符
:param filename: 文件名 :param filename: 文件名
:param file: 文件对象 :param file: 文件对象
:return: 如果文件是PDF格式,则返回True,否则返回False :return: 如果文件是PDF格式,则返回True,否则返回False
""" """
# 检查文件扩展名 https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况,先注释 try:
# if not filename.endswith('.pdf'): # 对文件名进行URL解码,处理特殊字符
# return False decoded_filename = urllib.parse.unquote(filename)
# 检查MIME类型 # 检查MIME类型
mime_type, _ = mimetypes.guess_type(filename) mime_type, _ = mimetypes.guess_type(decoded_filename)
print(mime_type) print(f"Detected MIME type: {mime_type}")
if mime_type != 'application/pdf':
return False # 某些情况下mime_type可能为None,需要特殊处理
if mime_type is None:
# 可选:读取文件的前几KB内容并检查MIME类型 # 只检查文件内容的PDF标识
# 这一步是可选的,用于更严格的检查 file_start = file.read(5)
# if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf': file.seek(0) # 重置文件指针
# return False return file_start.startswith(b'%PDF-')
# 检查文件内容 if mime_type != 'application/pdf':
file_start = file.read(5) return False
file.seek(0)
if not file_start.startswith(b'%PDF-'): # 检查文件内容的PDF标识
return False file_start = file.read(5)
file.seek(0) # 重置文件指针
return True if not file_start.startswith(b'%PDF-'):
return False
return True
except Exception as e:
print(f"Error checking PDF format: {str(e)}")
# 发生错误时,仍然尝试通过文件头判断
try:
file_start = file.read(5)
file.seek(0)
return file_start.startswith(b'%PDF-')
except:
return False
def url_is_pdf(file): def url_is_pdf(file):
......
...@@ -143,6 +143,14 @@ ...@@ -143,6 +143,14 @@
"created_at": "2025-01-20T05:30:38Z", "created_at": "2025-01-20T05:30:38Z",
"repoId": 765083837, "repoId": 765083837,
"pullRequestNo": 1578 "pullRequestNo": 1578
},
{
"name": "shniubobo",
"id": 6594544,
"comment_id": 2660086464,
"created_at": "2025-02-14T19:15:25Z",
"repoId": 765083837,
"pullRequestNo": 1693
} }
] ]
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment