Update ext.py

Determine whether a file name with a .pdf extension supports the inclusion of special characters or Chinese characters.

Update ext.py
Determine whether a file name with a .pdf extension supports the inclusion of special characters or Chinese characters.
034034c6 · sayThQ199 · GitHub · 97b19eb3 · 034034c6
Unverified Commit 034034c6 authored Feb 22, 2025 by sayThQ199 Committed by GitHub Feb 22, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 37 additions and 23 deletions

projects/web_demo/web_demo/common/ext.py projects/web_demo/web_demo/common/ext.py +37 -23

No files found.
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
 import hashlib
 import mimetypes
+import urllib.parse


 def is_pdf(filename, file):
    """
-    判断文件是否为PDF格式。
+    判断文件是否为PDF格式，支持中文名和特殊字符。

    :param filename: 文件名
    :param file: 文件对象
    :return: 如果文件是PDF格式，则返回True，否则返回False
    """
-    # 检查文件扩展名  https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况，先注释
-    # if not filename.endswith('.pdf'):
-    #     return False
-
-    # 检查MIME类型
-    mime_type, _ = mimetypes.guess_type(filename)
-    print(mime_type)
-    if mime_type != 'application/pdf':
-        return False
-
-    # 可选：读取文件的前几KB内容并检查MIME类型
-    # 这一步是可选的，用于更严格的检查
-    # if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf':
-    #     return False
-
-    # 检查文件内容
-    file_start = file.read(5)
-    file.seek(0)
-    if not file_start.startswith(b'%PDF-'):
-        return False
-
-    return True
+    try:
+        # 对文件名进行URL解码，处理特殊字符
+        decoded_filename = urllib.parse.unquote(filename)
+        
+        # 检查MIME类型
+        mime_type, _ = mimetypes.guess_type(decoded_filename)
+        print(f"Detected MIME type: {mime_type}")
+        
+        # 某些情况下mime_type可能为None，需要特殊处理
+        if mime_type is None:
+            # 只检查文件内容的PDF标识
+            file_start = file.read(5)
+            file.seek(0)  # 重置文件指针
+            return file_start.startswith(b'%PDF-')
+            
+        if mime_type != 'application/pdf':
+            return False
+
+        # 检查文件内容的PDF标识
+        file_start = file.read(5)
+        file.seek(0)  # 重置文件指针
+        if not file_start.startswith(b'%PDF-'):
+            return False
+
+        return True
+        
+    except Exception as e:
+        print(f"Error checking PDF format: {str(e)}")
+        # 发生错误时，仍然尝试通过文件头判断
+        try:
+            file_start = file.read(5)
+            file.seek(0)
+            return file_start.startswith(b'%PDF-')
+        except:
+            return False


 def url_is_pdf(file):