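# run_hy_ocr.py
# Batch OCR inference: reads a JSONL benchmark file, sends each image plus
# its question to a locally served tencent/HunyuanOCR model through vLLM's
# OpenAI-compatible chat API, and writes the answers back out as JSONL.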
import json
import base64
from openai import OpenAI
from tqdm import tqdm
from typing import Dict, List

def encode_image(image_path: str) -> str:
    """
    Encode image file to base64 string.
    
    Args:
        image_path: Path to the image file
        
    Returns:
        Base64 encoded string of the image
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def create_chat_messages(image_path: str, prompt: str) -> List[Dict]:
    """
    Create chat messages with image and prompt.
    
    Args:
        image_path: Path to the image file
        prompt: Text prompt for the model
        
    Returns:
        List of message dictionaries
    """
    return [
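        # Empty system prompt: the request relies entirely on the user turn.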
        {"role": "system", "content": ""},
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encode_image(image_path)}"
                    }
                },
                {"type": "text", "text": prompt}
            ]
        }
    ]

def process_single_item(client: OpenAI, data: Dict) -> Dict:
    """
    Process a single data item through the vLLM API.
    
    Args:
        client: OpenAI client instance
        data: Input data dictionary
        
    Returns:
        Updated data dictionary with model response
    """
    # Extract image path and prompt
    img_path = data['image_path']
    prompt = data['question']
    
    # Create chat messages
    messages = create_chat_messages(img_path, prompt)
    
    # Get model response. temperature=0.0 with top_k=1 gives greedy,
    # reproducible decoding, which suits OCR evaluation; top_k and
    # repetition_penalty are vLLM sampling extensions passed via extra_body.
    response = client.chat.completions.create(
        model="tencent/HunyuanOCR",
        messages=messages,
        temperature=0.0,
        top_p=0.95,
        seed=1234,
        stream=False,
        extra_body={
            "top_k": 1,
            "repetition_penalty": 1.0
        }
    )
    
    # Update data with model response
    data["vllm_answer"] = response.choices[0].message.content
    return data

def main():
    """Main function to process the JSONL file through VLLM API"""
    # Initialize an OpenAI-compatible client pointed at a local vLLM server;
    # api_key is a placeholder required by the SDK, and the one-hour timeout
    # allows for slow, long documents.
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
        timeout=3600
    )
    
    # Define input/output paths
    input_path = 'ominidoc_bench.jsonl'
    output_path = "infer_result_ominidoc_bench.jsonl"
    
    # Process data
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        
        # Iterate through input file
        for line in tqdm(fin, desc="Processing documents"):
            if not line.strip():
                continue
                
            try:
                # Load and process data
                data = json.loads(line)
                processed_data = process_single_item(client, data)
                
                # Write results
                fout.write(json.dumps(processed_data, ensure_ascii=False) + "\n")
            except Exception as e:
                print(f"Error processing line: {str(e)}")
                continue
    
    print(f"Processing completed. Results saved to: {output_path}")

if __name__ == "__main__":
    main()
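
# Usage sketch (assumptions: vLLM is installed and the serve flags below match
# your vLLM version; adjust as needed):
#
#   vllm serve tencent/HunyuanOCR --port 8000
#   python run_hy_ocr.py
#
# Each line of ominidoc_bench.jsonl is expected to carry at least:
#   {"image_path": "path/to/page.jpg", "question": "..."}
# The output file repeats each record with an added "vllm_answer" field.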