utils.py

import re
import os
import json
import random
from typing import Tuple, List, Dict
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def clean_repeated_substrings(text):
    """Clean repeated substrings in text"""
    n = len(text)
    if n<8000:
        return text
    for length in range(2, n // 10 + 1):
        candidate = text[-length:] 
        count = 0
        i = n - length
        
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length

        if count >= 10:
            return text[:n - length * (count - 1)]  

    return text

def norm_formula_HYOCR(gt_form,pre_form):
    """
    NORMARLIZE for OmniDocBench.input:gt_form,pred_form(from quick_match_display_formula)
    # End-to-end document parsing bypasses layout constraints and category definitions, 
    # yet the HunyuanOCR model produces outputs with more flexible granularities. 
    # For certain mathematical expressions, HunyuanOCR may parse them using a mix of plain texts and embedded LaTeX formulas. 
    # This can lead to mismatches when evaluated with OmniDocBench’s quick-match mechanism. 
    # Therefore, we manually correct cases with incorrect matches (i.e., those with an edit-distance score greater than 0.5). 
    # Below is a piece of logic code used to handle some normalization cases.
    """
    def clean_gt_tail(gt: str):
        pattern = r'(\\quad+|\\qquad+)\s*\{?\(\s*\d+\s*\)\}?\s*$'
        return re.sub(pattern, '', gt).rstrip()
    
    pre_form = clean_gt_tail(pre_form)
    gt_form = gt_form.replace('\[', '').replace('\]', '').replace('，', ',').strip()
    pre_form = pre_form.replace('\[', '').replace('\]', '').replace('，', ',').strip()
    pre_form = pre_form.replace("\\text{ⓐ}", "\\textcircled{a}") \
        .replace("\\text{ⓑ}", "\\textcircled{b}") \
        .replace("\\text{ⓒ}", "\\textcircled{c}") \
        .replace("\\text{ⓓ}", "\\textcircled{d}")
    
    return gt_form, pre_form


def parse_coords(coord_str: str) -> Tuple[float, float]:
    """Parse coordinate string and return (x,y) tuple"""
    try:
        x, y = coord_str.strip('()').split(',')
        return (float(x), float(y))
    except:
        return (0, 0)

def denormalize_coordinates(coord: Tuple[float, float], image_width: int, image_height: int) -> Tuple[int, int]:
    """Denormalize coordinates from [0,1000] to image dimensions""" 
    x, y = coord
    denorm_x = int(x * image_width / 1000)
    denorm_y = int(y * image_height / 1000)
    return (denorm_x, denorm_y)

def process_spotting_response(response: str, image_width: int, image_height: int) -> str:
    """Process spotting task response and denormalize coordinates"""
    try:
        # Find all text and coordinate pairs using regex
        pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
        matches = re.finditer(pattern, response)
        
        new_response = response
        for match in matches:
            text = match.group(1).strip()
            coords = match.group(2)
            
            # Parse the two coordinate points 
            coord_pattern = r'\((\d+),(\d+)\)'
            coord_matches = re.findall(coord_pattern, coords)
            if len(coord_matches) == 2:
                start_coord = (float(coord_matches[0][0]), float(coord_matches[0][1]))
                end_coord = (float(coord_matches[1][0]), float(coord_matches[1][1]))
                
                # Denormalize coordinates
                denorm_start = denormalize_coordinates(start_coord, image_width, image_height)
                denorm_end = denormalize_coordinates(end_coord, image_width, image_height)
                
                # Build new coordinate string
                new_coords = f"({denorm_start[0]},{denorm_start[1]}),({denorm_end[0]},{denorm_end[1]})"
                
                # Replace coordinates in original response
                new_response = new_response.replace(coords, new_coords)
        
        return new_response
    
    except Exception as e:
        print(f"Error processing response: {str(e)}")
        return response

def draw_text_detection_boxes(image: Image, response: str) -> Image:
    """Draw text detection boxes on image"""
    image_width, image_height = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    # Create transparent overlay
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw_overlay = ImageDraw.Draw(overlay)
    
    try:
        font = ImageFont.load_default()
    except IOError:
        font = ImageFont.load_default()
        
    # Extract text and coordinates using regex
    pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
    matches = re.finditer(pattern, response)
    
    for match in matches:
        try:
            text = match.group(1).strip()
            coords = match.group(2)
            
            # Parse coordinates
            coord_pattern = r'\((\d+),(\d+)\)'
            coord_matches = re.findall(coord_pattern, coords)
            
            if len(coord_matches) == 2:
                x1, y1 = int(coord_matches[0][0]), int(coord_matches[0][1])
                x2, y2 = int(coord_matches[1][0]), int(coord_matches[1][1])
                
                # Generate random color
                color = (np.random.randint(0, 200), 
                        np.random.randint(0, 200),
                        np.random.randint(0, 255))
                color_alpha = color + (20,)
                
                # Draw rectangle and overlay
                draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
                draw_overlay.rectangle([x1, y1, x2, y2], 
                                    fill=color_alpha,
                                    outline=(0, 0, 0, 0))
                
                # Draw text label
                text_x = x1
                text_y = max(0, y1 - 15)
                text_bbox = draw.textbbox((0, 0), text, font=font)
                text_width = text_bbox[2] - text_bbox[0]
                text_height = text_bbox[3] - text_bbox[1]
                
                draw.rectangle([text_x, text_y, 
                              text_x + text_width, text_y + text_height],
                             fill=(255, 255, 255, 30))
                draw.text((text_x, text_y), text, font=font, fill=color)
                
        except Exception as e:
            print(f"Error drawing box: {str(e)}")
            continue
            
    # Combine image with overlay
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw

def main():
    """Main function to process images and visualize results"""
    # Read JSONL file
    jsonl_path = "/path/to/test_data.jsonl"
    output_dir = "output_visualizations"
    image_root = "/path/to/image_root"
    
    os.makedirs(output_dir, exist_ok=True)
    
    # Read all lines from JSONL
    items = []
    with open(jsonl_path, 'r') as f:
        for line in f:
            items.append(json.loads(line.strip()))
            
    # Randomly select one item
    item = random.choice(items)
    
    # Get image path and response
    image_path = os.path.join(image_root, item["image_name"])
    response = clean_repeated_substrings(item["vllm-infer"])
    
    print(f"Processing image: {item['image_name']}")
    
    # Load and process image
    image = Image.open(image_path)
    image_width, image_height = image.size
    
    # Process response coordinates
    processed_response = process_spotting_response(response, image_width, image_height)
    print("Original response:", response)
    print("Processed response:", processed_response)
    
    # Draw detection boxes
    result_image = draw_text_detection_boxes(image, processed_response)
    
    # Save result using original image name
    output_path = os.path.join(output_dir, item["image_name"])
    result_image.save(output_path)
    print(f"Visualization saved to {output_path}")

if __name__ == "__main__":
    main()