#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from collections import defaultdict
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit
import requests
from olmocr.data.renderpdf import render_pdf_to_base64png
def parse_rules_file(file_path):
    """Parse a JSON-lines rules file and group its rules by PDF.

    Every non-blank line is decoded as JSON. A rule that has no ``checked``
    key gets ``checked = None`` so that every returned rule carries the field.
    Rules without a ``pdf`` key are dropped. Lines that are not valid JSON, or
    that decode to something other than a JSON object, are reported with a
    warning on stdout and skipped.

    Args:
        file_path: Path to the rules file (one JSON object per line).

    Returns:
        A ``defaultdict(list)`` mapping each ``pdf`` value to the list of rule
        dicts that reference it, in file order.
    """
    pdf_rules = defaultdict(list)
    # Decode as UTF-8 explicitly instead of relying on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rule = json.loads(line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse line as JSON: {line}")
                continue
            # A bare list/number/string is valid JSON but not a rule; the key
            # accesses below would raise TypeError on it.
            if not isinstance(rule, dict):
                print(f"Warning: Skipping line that is not a JSON object: {line}")
                continue
            # Add checked field if it doesn't exist
            rule.setdefault("checked", None)
            if "pdf" in rule:
                pdf_rules[rule["pdf"]].append(rule)
    return pdf_rules
def get_rule_html(rule, rule_index):
    """Generate the HTML fragment for a single rule.

    The fragment consists of the status-button markup, an upper-case label
    for the rule type (``PRESENT``, ``ABSENT``, ``ORDER`` or ``UNKNOWN``) and
    the rule's fields, one per line. All values taken from ``rule`` are
    HTML-escaped so that text containing ``<``, ``>``, ``&`` or quotes is
    shown literally instead of being interpreted as markup.

    Args:
        rule: Rule dict. Reads ``type`` (default ``"unknown"``), ``checked``
            and, depending on the type, ``text``, ``before``, ``after`` and
            ``threshold`` (default ``"N/A"``).
        rule_index: Index used to build the rule's element id.

    Returns:
        The fragment as a string.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _safe(value):
        """Return ``value`` as HTML-escaped text."""
        return escape(str(value))

    rule_type = rule.get("type", "unknown")

    # Determine status button class based on 'checked' value
    # NOTE(review): rule_id and the two class names are computed, but the
    # status-button template below is empty, so none of them reaches the
    # output. The markup looks like it was stripped; confirm against the
    # intended template.
    rule_id = f"rule-{rule_index}"
    checked_status = rule.get("checked")
    thumbs_up_class = "active" if checked_status == "verified" else ""
    thumbs_down_class = "active" if checked_status == "rejected" else ""

    # Create thumbs up/down buttons
    status_button = "\n"

    threshold_line = f"Threshold: {_safe(rule.get('threshold', 'N/A'))}"

    # Pick the label and field lines based on rule type
    if rule_type in ("present", "absent"):
        label = rule_type.upper()
        field_lines = [_safe(rule.get("text", "")), threshold_line]
    elif rule_type == "order":
        label = "ORDER"
        field_lines = [
            f"Before:{_safe(rule.get('before', ''))}",
            f"After:{_safe(rule.get('after', ''))}",
            threshold_line,
        ]
    else:
        label = "UNKNOWN"
        field_lines = [f"Unknown rule type: {_safe(rule_type)}"]

    # Leading and trailing empty items reproduce the newline that opens and
    # closes each fragment.
    return "\n".join(["", status_button, label, *field_lines, ""])
def generate_html(pdf_rules, rules_file_path, max_pdfs=10):
    """Generate the HTML page with PDF renderings and interactive rules.

    For each selected PDF the first page is rendered from the ``pdfs`` folder
    located next to the rules file, followed by a section listing that PDF's
    rules (one ``get_rule_html`` fragment per rule, numbered with a single
    index that runs across all PDFs). A rendering failure does not abort the
    page; the error message is shown in place of the image.

    Args:
        pdf_rules: Mapping of PDF name to the list of rule dicts for it.
        rules_file_path: Path of the rules file; its directory is where the
            ``pdfs`` folder is looked up.
        max_pdfs: Maximum number of PDFs (in mapping order) to include.
            Defaults to 10; ``None`` includes all of them.

    Returns:
        The complete page as a string.
    """
    # Function-scope import; the module name ``html`` is not otherwise needed.
    from html import escape

    # Limit to the first `max_pdfs` unique PDFs
    pdf_names = list(pdf_rules)[:max_pdfs]

    # Prepare rules data for JavaScript
    # NOTE(review): rules_json is built but never interpolated into the page
    # below, since the script section at the end is empty. Presumably it is
    # meant to seed the page's datastore; confirm against the intended
    # template.
    all_rules = [rule for name in pdf_names for rule in pdf_rules[name]]
    rules_json = json.dumps(all_rules)

    pdf_dir = os.path.join(os.path.dirname(rules_file_path), "pdfs")

    # Collect fragments and join once instead of repeated string +=.
    parts = ["\nInteractive PDF Rules Visualizer\nInteractive PDF Rules Visualizer\n"]

    # Global rule index for unique IDs
    rule_index = 0
    for pdf_name in pdf_names:
        # Render the PDF (first page only) from the /pdfs folder
        try:
            pdf_path = os.path.join(pdf_dir, pdf_name)
            # NOTE(review): the rendered image is assigned but the image
            # markup is an empty string, so it is never embedded; confirm
            # against the intended template.
            base64_img = render_pdf_to_base64png(pdf_path, 0)
            img_html = ""
        except Exception as e:
            # Deliberate best-effort: one unreadable PDF must not abort the
            # whole page. The message is escaped because it may echo the path.
            img_html = f"\nError rendering PDF: {escape(str(e))}\n"

        parts.append(
            f"\n{escape(str(pdf_name))}\n{img_html}\nStatus\nType\nContent\nParameters\n"
        )
        for rule in pdf_rules[pdf_name]:
            parts.append(get_rule_html(rule, rule_index))
            rule_index += 1
        # Close this PDF's section.
        parts.append("\n")

    # Add JavaScript to manage interactivity and datastore integration
    parts.append("\n")
    return "".join(parts)
def get_page_datastore(html: str, timeout: float = 30.0):
    """
    Fetch the JSON datastore referenced by a previously generated page.

    The page source is searched for a ``const presignedGetUrl = "...";``
    assignment. The URL's query string is re-encoded and the result is
    fetched with an HTTP GET.

    Args:
        html: Full text of the generated HTML page.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response, or None when the page contains
        no presigned URL or when fetching/decoding fails (the error is
        printed). Callers must check for None.
    """
    match = re.search(r"const presignedGetUrl = \"(.*?)\";", html)
    if match is None:
        return None
    presigned_url = match.group(1)
    try:
        # Clean up the presigned URL (sometimes the signature may need re-encoding)
        url_parts = urlsplit(presigned_url)
        encoded_query = urlencode(parse_qs(url_parts.query), doseq=True)
        cleaned_url = urlunsplit(url_parts._replace(query=encoded_query))
        # requests applies no timeout by default; without one a stalled
        # server would hang the script forever.
        resp = requests.get(cleaned_url, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        # Best-effort by design: any failure is reported and mapped to None.
        print(f"Error fetching datastore from {presigned_url}: {e}")
        return None
def main():
    """Command-line entry point.

    The mode is chosen by whether the output HTML file already exists:

    * If it exists, the datastore referenced by that page is fetched and the
      rules file is rewritten from it, one JSON object per line. The page is
      not regenerated.
    * Otherwise the rules file is parsed and a new page is written to the
      output path.

    Exits with status 1 when the rules file does not exist, or when the
    datastore cannot be loaded, is empty, or is not a list.
    """
    parser = argparse.ArgumentParser(description="Generate an interactive HTML visualization of PDF rules.")
    parser.add_argument("rules_file", help="Path to the rules file (JSON lines format)")
    parser.add_argument("-o", "--output", help="Output HTML file path", default="interactive_pdf_rules.html")
    args = parser.parse_args()

    if not os.path.exists(args.rules_file):
        print(f"Error: Rules file not found: {args.rules_file}")
        sys.exit(1)

    if os.path.exists(args.output):
        print(f"Output file {args.output} already exists, attempting to reload it's datastore")
        with open(args.output, "r", encoding="utf-8") as df:
            datastore = get_page_datastore(df.read())

        # Refuse anything that would truncate or corrupt the rules file:
        # a failed fetch (None), an empty payload, or a payload that is not a
        # list of rules.
        if not datastore or not isinstance(datastore, list):
            print(f"Datastore for {args.output} is empty, please run tinyhost and verify your rules and then rerun the script")
            sys.exit(1)

        print(f"Loaded {len(datastore)} entries from datastore, updating {args.rules_file}")

        # NOTE(review): generate_html only puts the rules of the first 10 PDFs
        # on the page. If the datastore holds just those rules, this overwrite
        # drops the rules of every other PDF from the rules file. Confirm what
        # the datastore contains.
        # Serialize everything before opening the file for writing, so a
        # serialization error cannot leave a half-written rules file.
        serialized = [json.dumps(rule) + "\n" for rule in datastore]
        with open(args.rules_file, "w", encoding="utf-8") as of:
            of.writelines(serialized)
        return

    pdf_rules = parse_rules_file(args.rules_file)
    html = generate_html(pdf_rules, args.rules_file)

    # Explicit UTF-8 so PDF text outside the locale's charset cannot fail the write.
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Interactive HTML visualization created: {args.output}")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()