#!/usr/bin/env python3
"""Generate an interactive HTML visualization of PDF rules.

The script reads a JSON-lines rules file, groups the rules by their ``pdf``
field, renders page index 0 of up to ``MAX_PDFS`` PDFs taken from the
``pdfs`` folder next to the rules file, and writes one HTML page containing
every PDF together with its rules.

When the output file already exists, the script instead looks for a
presigned URL inside that page, downloads the JSON datastore behind it and
writes the datastore entries back to the rules file.

NOTE(review): the string templates in this file contain text only, with no
HTML tags, styles or script. Several values are computed but never
interpolated (``rule_id``, the thumbs-up/down classes, ``rules_json``,
``base64_img``), and nothing here emits the ``const presignedGetUrl`` line
that ``get_page_datastore`` searches for. Presumably the markup was lost
before this file reached review -- confirm against the canonical copy.
"""

import argparse
import json
import os
import re
import sys
from collections import defaultdict
from html import escape
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

import requests

from olmocr.data.renderpdf import render_pdf_to_base64png

# Maximum number of distinct PDFs included in one generated page.
MAX_PDFS = 10

# Seconds to wait for the datastore download; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT_SECONDS = 30


def _esc(value):
    """Return ``value`` converted to ``str`` and escaped for use in HTML."""
    return escape(str(value))


def parse_rules_file(file_path):
    """Parse the rules file and organize rules by PDF.

    Each non-blank line is expected to hold one JSON object. Lines that are
    not valid JSON, or that decode to something other than an object, are
    reported with a warning and skipped. Rules without a ``checked`` key get
    ``checked = None``. Rules without a ``pdf`` key are dropped silently.

    Args:
        file_path: Path to the JSON-lines rules file.

    Returns:
        A ``defaultdict(list)`` mapping each ``pdf`` value to its rules, in
        file order.
    """
    pdf_rules = defaultdict(list)

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                rule = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue

            # Valid JSON such as `42` or `[1, 2]` is not a rule; the key
            # handling below would raise TypeError on it.
            if not isinstance(rule, dict):
                print(f"Warning: Skipping line that is not a JSON object: {line}")
                continue

            # Add checked field if it doesn't exist
            rule.setdefault("checked", None)

            if "pdf" in rule:
                pdf_rules[rule["pdf"]].append(rule)

    return pdf_rules


def get_rule_html(rule, rule_index):
    """Generate the HTML representation for a single rule.

    All values taken from ``rule`` are HTML-escaped before interpolation so
    that characters such as ``<`` or ``&`` cannot break the page.

    Args:
        rule: Rule dictionary; ``type`` selects the layout (``present``,
            ``absent``, ``order``, anything else is shown as unknown).
        rule_index: Index used to build the rule's element id.

    Returns:
        The fragment for this rule as a string.
    """
    rule_type = rule.get("type", "unknown")

    # NOTE(review): rule_id and the two classes below are not interpolated
    # into any visible template; kept for the status-button markup that
    # presumably belongs in `status_button`.
    rule_id = f"rule-{rule_index}"

    # Determine status button class based on 'checked' value
    checked_status = rule.get("checked")
    thumbs_up_class = "active" if checked_status == "verified" else ""
    thumbs_down_class = "active" if checked_status == "rejected" else ""

    # Create thumbs up/down buttons
    status_button = "\n\n"

    threshold = _esc(rule.get("threshold", "N/A"))

    # Create HTML based on rule type
    if rule_type == "present":
        text = _esc(rule.get("text", ""))
        return f" {status_button} PRESENT\n\n{text}\n\nThreshold: {threshold} "

    if rule_type == "absent":
        text = _esc(rule.get("text", ""))
        return f" {status_button} ABSENT\n\n{text}\n\nThreshold: {threshold} "

    if rule_type == "order":
        before = _esc(rule.get("before", ""))
        after = _esc(rule.get("after", ""))
        return (
            f" {status_button} ORDER\n\nBefore: {before}\n\n"
            f"After: {after}\n\nThreshold: {threshold} "
        )

    return f" {status_button} UNKNOWN Unknown rule type: {_esc(rule_type)} "


def generate_html(pdf_rules, rules_file_path):
    """Generate the HTML page with PDF renderings and interactive rules.

    Only the first ``MAX_PDFS`` PDFs (in mapping order) are included. A PDF
    that fails to render is replaced by an error message instead of aborting
    the whole page.

    Args:
        pdf_rules: Mapping of PDF name to its list of rule dictionaries.
        rules_file_path: Path of the rules file; PDFs are looked up in the
            ``pdfs`` folder next to it.

    Returns:
        The complete page as a single string.
    """
    # Limit to MAX_PDFS unique PDFs
    pdf_names = list(pdf_rules)[:MAX_PDFS]

    # Prepare rules data for JavaScript
    all_rules = []
    for pdf_name in pdf_names:
        all_rules.extend(pdf_rules[pdf_name])
    # NOTE(review): rules_json is not interpolated anywhere below; presumably
    # the script section that consumed it is missing -- confirm.
    rules_json = json.dumps(all_rules)

    pdf_dir = os.path.join(os.path.dirname(rules_file_path), "pdfs")

    parts = [" Interactive PDF Rules Visualizer\n\nInteractive PDF Rules Visualizer\n\n"]

    # Global rule index for unique IDs
    rule_index = 0

    for pdf_name in pdf_names:
        rules = pdf_rules[pdf_name]
        safe_name = _esc(pdf_name)

        # Render the PDF (first page only) from the /pdfs folder
        try:
            pdf_path = os.path.join(pdf_dir, pdf_name)
            # NOTE(review): base64_img is never interpolated; the call is kept
            # because a rendering failure must still reach the error branch.
            base64_img = render_pdf_to_base64png(pdf_path, 0)
            img_html = f" ${safe_name}$ "
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the other PDFs
            # from being rendered.
            img_html = f"\n\nError rendering PDF: {_esc(e)}\n\n"

        parts.append(f"\n\n{safe_name}\n\n{img_html}\n\n")

        for rule in rules:
            parts.append(get_rule_html(rule, rule_index))
            rule_index += 1

        parts.append("\n\nStatus\tType\tContent\tParameters\n\n")

    # Add JavaScript to manage interactivity and datastore integration
    parts.append("\n\n")

    return "".join(parts)


def get_page_datastore(html: str):
    """Fetch the JSON datastore referenced by a generated page.

    The page is searched for a ``const presignedGetUrl = "...";`` statement
    and the URL found there is downloaded.

    Args:
        html: Full text of a previously generated page.

    Returns:
        The decoded JSON document, or ``None`` when the page contains no
        presigned URL or when the download or decoding fails.
    """
    match = re.search(r"const presignedGetUrl = \"(.*?)\";", html)
    if not match:
        return None

    presigned_url = match.group(1)

    try:
        # Clean up the presigned URL (sometimes the signature may need re-encoding)
        url_parts = urlsplit(presigned_url)
        # keep_blank_values: parse_qs drops parameters with empty values by
        # default, which would change the query string being re-encoded.
        query_params = parse_qs(url_parts.query, keep_blank_values=True)
        encoded_query = urlencode(query_params, doseq=True)
        cleaned_url = urlunsplit(
            (url_parts.scheme, url_parts.netloc, url_parts.path, encoded_query, url_parts.fragment)
        )

        resp = requests.get(cleaned_url, timeout=REQUEST_TIMEOUT_SECONDS)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        # Deliberately broad: any failure here means "no datastore", which
        # the caller reports to the user.
        print(f"Error fetching datastore from {presigned_url}: {e}")
        return None


def main():
    """Command-line entry point.

    If the output file already exists, its datastore is downloaded and
    written back to the rules file. Otherwise a new page is generated from
    the rules file. Exits with status 1 when the rules file is missing or
    when no usable datastore can be loaded.
    """
    parser = argparse.ArgumentParser(description="Generate an interactive HTML visualization of PDF rules.")
    parser.add_argument("rules_file", help="Path to the rules file (JSON lines format)")
    parser.add_argument("-o", "--output", help="Output HTML file path", default="interactive_pdf_rules.html")

    args = parser.parse_args()

    if not os.path.exists(args.rules_file):
        print(f"Error: Rules file not found: {args.rules_file}")
        sys.exit(1)

    if os.path.exists(args.output):
        print(f"Output file {args.output} already exists, attempting to reload it's datastore")

        with open(args.output, "r", encoding="utf-8") as df:
            datastore = get_page_datastore(df.read())

        # An empty datastore is treated like a missing one: writing it out
        # would truncate the rules file to nothing.
        if not datastore:
            print(f"Datastore for {args.output} is empty, please run tinyhost and verify your rules and then rerun the script")
            sys.exit(1)

        # Iterating a dict would write its keys as if they were rules.
        if not isinstance(datastore, list):
            print(f"Error: Datastore for {args.output} is not a list of rules, leaving {args.rules_file} unchanged")
            sys.exit(1)

        print(f"Loaded {len(datastore)} entries from datastore, updating {args.rules_file}")

        with open(args.rules_file, "w", encoding="utf-8") as of:
            of.writelines(json.dumps(rule) + "\n" for rule in datastore)

        return

    pdf_rules = parse_rules_file(args.rules_file)
    page = generate_html(pdf_rules, args.rules_file)

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(page)

    print(f"Interactive HTML visualization created: {args.output}")


if __name__ == "__main__":
    main()