run_claude.py 2.3 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
import os

from anthropic import Anthropic
from prompts import (
    build_openai_silver_data_prompt,
    claude_response_format_schema,
)

from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text


def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet-20250219", temperature: float = 0.1) -> str:
    """
    Convert page of a PDF file to markdown using Claude OCR.
    This function renders the specified page of the PDF to an image, runs OCR on that image,
    and returns the OCR result as a markdown-formatted string.

    Args:
        pdf_path (str): The local path to the PDF file.
        page_num (int): The page number to process (starting from 1).
        model (str): The Claude model to use.
        temperature (float): The temperature parameter for generation.

    Returns:
        str: The OCR result in markdown format.
    """

    if not os.getenv("ANTHROPIC_API_KEY"):
        raise SystemExit("You must specify an ANTHROPIC_API_KEY")

    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    response = client.messages.create(
        model=model,
        max_tokens=3000,
        temperature=temperature,
        # system=system_prompt,
        tools=claude_response_format_schema(),
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
                    {
                        "type": "text",
                        "text": f"{build_openai_silver_data_prompt(anchor_text)}. Use the page_response tool to respond. If the propeties are true, then extract the text from them and respond in natural_text.",
                    },
                ],
            }
        ],
    )

    json_sentiment = None
    for content in response.content:
        if content.type == "tool_use" and content.name == "page_response":
            json_sentiment = content.input
            break

    if json_sentiment:
        response = json.dumps(json_sentiment, indent=2)
        return response