run_mistral.py 2.32 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import json
import os
import tempfile

from mistralai import Mistral
from pypdf import PdfReader, PdfWriter

def run_mistral(pdf_path: str, page_num: int = 1) -> str:
    """
    Convert a page of a PDF file to markdown using the Mistral OCR API.
    https://docs.mistral.ai/capabilities/document/

    Args:
        pdf_path (str): The local path to the PDF file.
        page_num (int): 1-based number of the page to OCR. The page is copied
            into a temporary single-page PDF and only that file is uploaded.
            If zero or negative, the original PDF is uploaded unchanged.

    Returns:
        str: The OCR result in markdown format. This is the markdown of the
            first page in the OCR response; when the whole PDF is uploaded
            (page_num <= 0) the remaining pages are not returned.

    Raises:
        SystemExit: If the MISTRAL_API_KEY environment variable is unset or empty.
        ValueError: If page_num is greater than the number of pages in the PDF.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise SystemExit("You must specify an MISTRAL_API_KEY")

    client = Mistral(api_key=api_key)

    # Stays None when no temporary file is created, so the cleanup in the
    # outer ``finally`` is safe on every code path (including page_num <= 0).
    temp_path = None
    try:
        if page_num > 0:  # A specific page is requested
            reader = PdfReader(pdf_path)
            total_pages = len(reader.pages)

            # Check that the requested page exists
            if page_num > total_pages:
                raise ValueError(f"Page {page_num} does not exist in the PDF. PDF has {total_pages} pages.")

            # Build a new PDF holding just the requested page.
            # pypdf uses 0-based indexing, so subtract 1 from page_num.
            writer = PdfWriter()
            writer.add_page(reader.pages[page_num - 1])

            # delete=False keeps the file on disk after the handle is closed so
            # it can be reopened for the upload; it is removed in ``finally``.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_path = temp_file.name
                writer.write(temp_file)

            pdf_to_process = temp_path
        else:
            pdf_to_process = pdf_path

        with open(pdf_to_process, "rb") as pf:
            uploaded_pdf = client.files.upload(
                file={
                    # Report the original file name even when the content is
                    # the extracted single-page temporary file.
                    "file_name": os.path.basename(pdf_path),
                    "content": pf,
                },
                purpose="ocr",
            )

        try:
            signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

            ocr_response = client.ocr.process(
                model="mistral-ocr-2503",
                document={
                    "type": "document_url",
                    "document_url": signed_url.url,
                },
            )
        finally:
            # Remove the uploaded file even if signing or OCR failed, so a
            # failed run does not leave the document stored remotely.
            client.files.delete(file_id=uploaded_pdf.id)

        return ocr_response.pages[0].markdown
    finally:
        # Clean up the temporary file if it was created
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)