Merge pull request #1743 from xuzijie1995/master

Update the is_pdf function in ext.py to support PDF files whose names contain Chinese characters and special characters

Merge pull request #1743 from xuzijie1995/master
Update the is_pdf function in ext.py to support PDF files whose names contain Chinese characters and special characters
08dd3a85 · Xiaomeng Zhao · GitHub · 8dd01346 · 034034c6 · 08dd3a85
Unverified Commit 08dd3a85 authored Feb 24, 2025 by Xiaomeng Zhao Committed by GitHub Feb 24, 2025
3 changed files
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -123,4 +123,5 @@ jobs:
      - name: Publish distribution to PyPI
        run: |
          pip install twine
+          twine check dist/*
          twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
 import hashlib
 import mimetypes
+import urllib.parse
 def is_pdf(filename, file):
    """
-    判断文件是否为PDF格式。
+    判断文件是否为PDF格式，支持中文名和特殊字符。
    :param filename: 文件名
    :param file: 文件对象
    :return: 如果文件是PDF格式，则返回True，否则返回False
    """
-    # 检查文件扩展名  https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况，先注释
+    try:
-    # if not filename.endswith('.pdf'):
+        # 对文件名进行URL解码，处理特殊字符
-    #     return False
+        decoded_filename = urllib.parse.unquote(filename)
-    # 检查MIME类型
+        # 检查MIME类型
-    mime_type, _ = mimetypes.guess_type(filename)
+        mime_type, _ = mimetypes.guess_type(decoded_filename)
-    print(mime_type)
+        print(f"Detected MIME type: {mime_type}")
-    if mime_type != 'application/pdf':
-        return False
+        # 某些情况下mime_type可能为None，需要特殊处理
+        if mime_type is None:
-    # 可选：读取文件的前几KB内容并检查MIME类型
+            # 只检查文件内容的PDF标识
-    # 这一步是可选的，用于更严格的检查
+            file_start = file.read(5)
-    # if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf':
+            file.seek(0)  # 重置文件指针
-    #     return False
+            return file_start.startswith(b'%PDF-')
-    # 检查文件内容
+        if mime_type != 'application/pdf':
-    file_start = file.read(5)
+            return False
-    file.seek(0)
-    if not file_start.startswith(b'%PDF-'):
+        # 检查文件内容的PDF标识
-        return False
+        file_start = file.read(5)
+        file.seek(0)  # 重置文件指针
-    return True
+        if not file_start.startswith(b'%PDF-'):
+            return False
+        return True
+    except Exception as e:
+        print(f"Error checking PDF format: {str(e)}")
+        # 发生错误时，仍然尝试通过文件头判断
+        try:
+            file_start = file.read(5)
+            file.seek(0)
+            return file_start.startswith(b'%PDF-')
+        except:
+            return False
 def url_is_pdf(file):

--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -143,6 +143,14 @@
      "created_at": "2025-01-20T05:30:38Z",
      "repoId": 765083837,
      "pullRequestNo": 1578
+    },
+    {
+      "name": "shniubobo",
+      "id": 6594544,
+      "comment_id": 2660086464,
+      "created_at": "2025-02-14T19:15:25Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1693
    }
  ]
 }
\ No newline at end of file