"""Post-process OmniDocBench display-formula matching results.

Because HunYuanOCR parses pages end-to-end and ignores the restrictions that a
pre-layout step would put on the panel categories, the model's parsing results
are diverse, and the benchmark's quick match may produce about 8% false
matches.  A hierarchical paradigm is therefore adopted:

- Edit distance < 0.4: the match is considered correct and is used directly
  as ``Form_part1``.
- Edit distance >= 0.4: the case may be caused by a model parsing failure or
  by an incorrect match.  The match is adjusted through simple automated
  post-processing plus manual post-processing.
"""

import json
import os
import re
from collections import Counter
from difflib import SequenceMatcher

# ======================= Constants =======================

# ``\big{(}``-style delimiters: group 1 is the size command, group 2 the
# braced delimiter.
_BIG_BRACES_RE = re.compile(r'\\(big|Big|bigg|Bigg)\{([^\}]+)\}')

# Trailing equation number such as ``\quad (3)`` or ``\quad \qquad {(12)}``.
# The spacing command may repeat; the earlier pattern ``\\quad+`` only repeated
# the final letter "d" and therefore left part of the spacing behind.
_TAIL_EQ_NUMBER_RE = re.compile(r'(?:\\q?quad\s*)+\{?\(\s*\d+\s*\)\}?\s*$')

# Leading markdown list numbering such as ``12. ``.
_LIST_PREFIX_RE = re.compile(r"^\s*\d+\.\s*")

# LaTeX letter -> Unicode circled letter.  Written as escapes so the
# literals survive a wrongly decoded source file.
_CIRCLED_LETTERS = (
    ("a", "\u24d0"),
    ("b", "\u24d1"),
    ("c", "\u24d2"),
    ("d", "\u24d3"),
)

# Images whose wrong matches are repaired by fuzzy line matching
# (``process_badcases``) instead of the plain substring rule.
_FUZZY_MATCH_IDS = frozenset({
    "yanbaopptmerge_9081a70ff98b3e7d640660a9412c447d.pdf_1287.jpg",
})


# ======================= Tool Functions =======================

def remove_big_braces(s: str):
    r"""Rewrite ``\big{(}`` / ``\Big{)}`` / ``\bigg{[}`` ... as ``\big(`` etc.

    Args:
        s: LaTeX source string.

    Returns:
        The string with the braces around sized delimiters removed.
    """
    return _BIG_BRACES_RE.sub(r'\\\1\2', s)


def process_final_ans(final_ans):
    """Apply ``remove_big_braces`` to every string ``pred`` field, in place.

    Args:
        final_ans: list of record dicts; records without a string ``pred``
            are left untouched.

    Returns:
        The same list object, for call chaining.
    """
    for item in final_ans:
        if isinstance(item.get("pred"), str):
            item["pred"] = remove_big_braces(item["pred"])
    return final_ans


def clean_gt_tail(gt: str):
    r"""Strip a trailing equation number such as ``\quad (3)`` from a formula.

    Args:
        gt: formula string (despite the name, the pipeline passes the
            prediction here).

    Returns:
        The formula without the trailing spacing + number, right-stripped.
    """
    return _TAIL_EQ_NUMBER_RE.sub('', gt).rstrip()


def load_instances(jsonl_path):
    """Read a JSON-lines file, skipping blank and unparsable lines.

    Args:
        jsonl_path: path of the ``.jsonl`` file.

    Returns:
        List of decoded objects, in file order.
    """
    instances = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                instances.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: one broken row must not abort the whole run.
                print(f"[WARN] Parsing failed: {e}, skipped")

    print(f"[INFO] Read {len(instances)} markdown instances from {jsonl_path}")
    return instances


def count_id_distribution(error_match_ans):
    """Print and return how often each ``id`` occurs in ``error_match_ans``.

    This is an intermediate diagnostic output.

    Args:
        error_match_ans: list of record dicts; records without ``id`` are
            counted in the total but not in the distribution.

    Returns:
        ``collections.Counter`` mapping id -> number of records.
    """
    id_list = [item['id'] for item in error_match_ans if 'id' in item]
    counter = Counter(id_list)

    print("\n===============================")
    print("📌 ID field distribution statistics (intermediate output)")
    print("===============================")
    print("Total number of records:", len(error_match_ans))
    print("Number of records containing id:", len(id_list))
    print("\n=== Distribution of the id field ===")

    for id_val, cnt in counter.most_common():
        print(f"id={id_val} : {cnt} 次")

    print("===============================\n")
    return counter


# ---------- Matching tools ----------

def normalize_for_match(text: str):
    r"""Canonicalise a formula for fuzzy comparison.

    ``\textcircled{a}`` and ``\text{<circled a>}`` (likewise b, c, d) are
    mapped to the bare Unicode circled letter, and all spaces are removed.

    Args:
        text: formula string.

    Returns:
        The normalised string.
    """
    for letter, circled in _CIRCLED_LETTERS:
        text = text.replace("\\textcircled{" + letter + "}", circled)
    for _, circled in _CIRCLED_LETTERS:
        text = text.replace("\\text{" + circled + "}", circled)
    return text.replace(" ", "")


def clean_formula(text: str):
    r"""Remove ``\quad`` and ``$`` from ``text`` and strip whitespace."""
    return text.replace("\\quad", "").replace("$", "").strip()


def extract_candidates(markdown: str):
    """Split a markdown answer into candidate formula lines.

    Each non-empty line has its list numbering (``1. ``) removed and is
    cleaned with ``clean_formula``; lines that end up empty are dropped.

    Args:
        markdown: the model's markdown output for one page.

    Returns:
        List of cleaned candidate strings, in document order.
    """
    candidates = []
    for line in markdown.split("\n"):
        line = line.strip()
        if not line:
            continue
        cleaned = clean_formula(_LIST_PREFIX_RE.sub("", line))
        if cleaned:
            candidates.append(cleaned)
    return candidates


def best_match(gt: str, candidates):
    """Pick the candidate most similar to ``gt``.

    Similarity is ``difflib.SequenceMatcher.ratio`` on the normalised strings;
    on ties the earliest candidate wins.

    Args:
        gt: ground-truth formula.
        candidates: iterable of candidate strings.

    Returns:
        ``(best_candidate, score)``; ``(None, -1)`` when there are no
        candidates.
    """
    gt_norm = normalize_for_match(gt)
    best_score = -1
    best_cand = None

    for cand in candidates:
        score = SequenceMatcher(None, gt_norm, normalize_for_match(cand)).ratio()
        if score > best_score:
            best_score = score
            best_cand = cand

    return best_cand, best_score


def process_badcases(Form_part2):
    r"""Re-match wrongly matched samples against their page markdown.

    For every case the markdown line closest to the ground truth becomes the
    new prediction, with Unicode circled letters rewritten back to
    ``\textcircled{x}``.

    Args:
        Form_part2: list of dicts with ``img_id``, ``gt`` and, normally,
            ``markdown`` (plus the earlier ``pred``).

    Returns:
        List of dicts with keys ``img_id``, ``gt``, ``pred``, ``match_score``.
    """
    results = []
    for case in Form_part2:
        gt = case["gt"]
        # The page may have no markdown row at all; treat that as "no lines".
        candidates = extract_candidates(case.get("markdown", ""))
        pred, score = best_match(gt, candidates)

        if pred is None:
            # Nothing to match against: keep the earlier prediction instead of
            # failing on ``None.replace`` (score stays at the -1 sentinel).
            pred = case.get("pred", "")

        # Wrapped form first, so ``\text{<circled>}`` does not end up as
        # ``\text{\textcircled{x}}``.
        for letter, circled in _CIRCLED_LETTERS:
            pred = pred.replace("\\text{" + circled + "}",
                                "\\textcircled{" + letter + "}")
        for letter, circled in _CIRCLED_LETTERS:
            pred = pred.replace(circled, "\\textcircled{" + letter + "}")

        results.append({
            'img_id': case['img_id'],
            "gt": gt,
            "pred": pred,
            "match_score": score
        })
    return results


def _strip_math_delimiters(text):
    r"""Remove ``$``, ``\[`` and ``\]`` from ``text`` and strip whitespace."""
    # NOTE(review): the original chain also had ``.replace(',', ',')``, a no-op
    # as written; presumably it once mapped a full-width comma to an ASCII
    # comma - confirm against upstream before restoring it.
    return (text.replace('$', '')
                .replace('\\[', '')
                .replace('\\]', '')
                .strip())


# ======================= Main Function =======================

def process_formula_matching(match_file, markdown_file, markdown_key, output_file,
                             edit_threshold=0.4):
    """Repair quick-match formula results and write the merged JSON file.

    Args:
        match_file: JSON file with the quick-match results (list of dicts with
            ``img_id``, ``gt``, ``pred``, ``edit``).
        markdown_file: JSON-lines file with the parsing results; every row
            needs ``image_path`` (a list, first entry used) and
            ``markdown_key``.
        markdown_key: key of the markdown answer inside each JSONL row.
        output_file: destination path of the merged JSON list.
        edit_threshold: matches with an edit distance below this value are
            trusted as they are (default 0.4).

    Returns:
        None.  The result is written to ``output_file``.
    """
    # ----------- Step1: Read the matching result file -----------
    with open(match_file, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    final_ans = [
        {
            'img_id': f"{idx}",
            'id': f"{item['img_id']}",
            'gt': _strip_math_delimiters(item['gt']),
            'pred': _strip_math_delimiters(item['pred']),
            'edit': item['edit']
        }
        for idx, item in enumerate(raw_data)
    ]
    final_ans = process_final_ans(final_ans)

    # ----------- Step2: Split Form_part1 / error_match -----------
    Form_part1 = []
    error_match_ans = []

    for item in final_ans:
        item['pred'] = clean_gt_tail(item['pred'])
        if item['edit'] < edit_threshold:
            Form_part1.append(item)
        else:
            error_match_ans.append(item)

    count_id_distribution(error_match_ans)

    # ----------- Step3: Write markdown into error_match_ans -----------
    # Index the markdown by image file name once instead of scanning every
    # row per sample; a later row with the same name overrides an earlier one.
    markdown_by_name = {}
    for inst in load_instances(markdown_file):
        markdown_by_name[os.path.basename(inst['image_path'][0])] = inst[markdown_key]

    for seq in error_match_ans:
        if seq['id'] in markdown_by_name:
            seq['markdown'] = markdown_by_name[seq['id']]

    # ----------- Step4: Special case handling for Form_part2 (id points to a specific file) -----------
    Form_part2 = [x for x in error_match_ans if x['id'] in _FUZZY_MATCH_IDS]

    # Matching bad samples
    out = process_badcases(Form_part2)

    # ----------- Step5: For regular error matching, substring rules are used directly. -----------
    Form_part3 = []
    for item in error_match_ans:
        if item['id'] in _FUZZY_MATCH_IDS:
            continue

        gt = item['gt'].replace(' ', '')
        # A sample whose image has no markdown row keeps its prediction.
        answer = item.get('markdown', '').replace('$', '').replace(' ', '')

        # The ground truth appears verbatim in the page output, so the model
        # parsed it correctly and only the quick match was wrong.
        if gt in answer:
            item['pred'] = item['gt']

        Form_part3.append(item)

    # ----------- Step6: Combine all results and output. -----------
    merge_form = Form_part1 + out + Form_part3

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merge_form, f, ensure_ascii=False, indent=4)

    print(f"[DONE] Saved final result to {output_file}")


# ======================= END =======================

if __name__ == "__main__":
    process_formula_matching(
        # OmniDocBench quick-match formula matching results
        match_file="vllm_omni_quick_match_display_formula_result.json",
        # parsing result jsonl from vllm
        markdown_file="OCR_OmniDocbench_vllm_infer_res.jsonl",
        # answer key
        markdown_key="vllm_answer_eth",
        # The output file will be evaluated using the same method described in
        # https://github.com/opendatalab/UniMERNet/tree/main/cdm.
        output_file="Final_formula.json"
    )