Commit bffed0fe authored by dengjb's avatar dengjb
Browse files

update

parents
{
"name": "match-at",
"version": "0.1.0",
"description": "Relocatable regular expressions.",
"repository": {
"type": "git",
"url": "https://github.com/spicyj/match-at"
},
"main": "lib/matchAt.js",
"files": [
"lib/"
],
"devDependencies": {
"babel": "^4.7.16",
"jest-cli": "^0.4.0",
"react-tools": "^0.13.1"
},
"jest": {
"scriptPreprocessor": "<rootDir>/jestSupport/preprocessor.js",
"unmockedModulePathPatterns": [
""
]
},
"scripts": {
"prepublish": "babel -d lib/ src/",
"test": "jest"
},
"gitHead": "4197daff69720734c72ba3321ed68a41c0527fb2",
"bugs": {
"url": "https://github.com/spicyj/match-at/issues"
},
"homepage": "https://github.com/spicyj/match-at",
"_id": "match-at@0.1.0",
"_shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31",
"_from": "match-at@",
"_npmVersion": "2.2.0",
"_nodeVersion": "0.10.35",
"_npmUser": {
"name": "spicyj",
"email": "ben@benalpert.com"
},
"maintainers": [
{
"name": "spicyj",
"email": "ben@benalpert.com"
}
],
"dist": {
"shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31",
"tarball": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz"
},
"directories": {},
"_resolved": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz"
}
# taken and modified from https://github.com/harvardnlp/im2markup
# tokenize latex formulas
import sys
import os
import re
import argparse
import subprocess
import shutil
from threading import Timer
from datetime import datetime
def run_cmd(cmd, timeout_sec=30):
    """Run ``cmd`` through the shell, killing it after ``timeout_sec`` seconds.

    A watchdog timer calls ``kill()`` on the spawned process if it is still
    running when the timeout elapses. Nothing is returned; the exit status
    and output of the command are not inspected.
    """
    child = subprocess.Popen(cmd, shell=True)
    watchdog = Timer(timeout_sec, child.kill)
    try:
        watchdog.start()
        # No pipes were requested, so this simply waits for the child.
        child.communicate()
    finally:
        # Always disarm the watchdog, whether the child finished or not.
        watchdog.cancel()
# Operator names that are rewritten from ``\operatorname {name}`` back to ``\name``.
_OPERATOR_NAMES = [
    'arccos', 'arcsin', 'arctan', 'arg', 'cos', 'cosh', 'cot', 'coth', 'csc',
    'deg', 'det', 'dim', 'exp', 'gcd', 'hom', 'inf', 'injlim', 'ker', 'lg',
    'lim', 'liminf', 'limsup', 'ln', 'log', 'max', 'min', 'Pr', 'projlim',
    'sec', 'sin', 'sinh', 'sup', 'tan', 'tanh',
]
# ``r'\s?'`` is joined between every *character* of the alternation string, so
# a name whose letters are separated by whitespace ("s i n") still matches.
_OPERATORNAME_RE = re.compile(
    r'\\operatorname {(%s)}' % r'\s?'.join('|'.join(_OPERATOR_NAMES))
)


def _run_filter(argv, src_path, dst_path):
    """Run ``argv`` with stdin read from ``src_path`` and stdout written to ``dst_path``.

    Returns the child's exit status. If the program cannot be started at all
    (e.g. it is not installed) 127 is returned instead of raising, so callers
    only need a single ``!= 0`` check. No shell is involved, so file names
    are never interpreted as shell syntax.
    """
    try:
        with open(src_path, 'rb') as fin, open(dst_path, 'wb') as fout:
            return subprocess.call(argv, stdin=fin, stdout=fout)
    except OSError:
        return 127


def _read_last_line_tokens(path):
    """Return the last line of ``path`` with whitespace runs collapsed to single spaces.

    Returns ``None`` when the file contains no lines.
    """
    result = None
    with open(path, 'r') as fin:
        for line in fin:
            result = ' '.join(line.strip().split())
    return result


def tokenize_latex(latex_code, latex_type="", middle_file=""):
    """Tokenize a LaTeX formula or tabular with the bundled node preprocessors.

    Args:
        latex_code: LaTeX source to tokenize.
        latex_type: ``"formula"`` or ``"tabular"``. When empty it is guessed:
            ``"tabular"`` if the substring ``tabular`` occurs in
            ``latex_code``, otherwise ``"formula"``.
        middle_file: Path of the scratch file used to exchange data with the
            external tools; ``middle_file + '.tmp'`` is used as a second
            scratch file. Defaults to a timestamped name in the current
            directory. Both files are removed before returning.

    Returns:
        ``(True, tokens)`` on success, where ``tokens`` is the space-joined
        token string taken from the last output line of the preprocessor.
        ``(False, latex_code)`` when the input is empty, the type is not
        recognised, an external tool exits with a non-zero status (or cannot
        be started), or the tool produced no output. For tabular input the
        returned ``latex_code`` is the comment-stripped version.
    """
    if not latex_code:
        return False, latex_code
    if not latex_type:
        latex_type = "tabular" if "tabular" in latex_code else "formula"
    if latex_type not in ("formula", "tabular"):
        print(f"latex type{latex_type} unrecognized.")
        return False, latex_code
    if not middle_file:
        middle_file = "out-" + datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + ".txt"
    temp_file = middle_file + '.tmp'
    script_dir = os.path.dirname(__file__)

    try:
        if latex_type == "formula":
            # Replace split/align-like environments with aligned, and
            # smallmatrix with matrix, before normalisation.
            prepre = re.sub(
                r'\\begin{(split|align|alignedat|alignat|eqnarray)\*?}(.+?)\\end{\1\*?}',
                r'\\begin{aligned}\2\\end{aligned}', latex_code, flags=re.S)
            prepre = re.sub(
                r'\\begin{(smallmatrix)\*?}(.+?)\\end{\1\*?}',
                r'\\begin{matrix}\2\\end{matrix}', prepre, flags=re.S)
            with open(temp_file, 'w') as f:
                f.write(prepre)

            normalizer = os.path.join(script_dir, 'preprocess_formula.js')
            if _run_filter(['node', normalizer, 'normalize'], temp_file, middle_file) != 0:
                return False, latex_code

            post = _read_last_line_tokens(middle_file)
            if post is None:
                # The normaliser exited cleanly but emitted nothing.
                return False, latex_code
            # Use \sin instead of \operatorname{sin}.
            post = _OPERATORNAME_RE.sub(
                lambda match: '\\' + match.group(1).replace(' ', ''), post)
            post = post.replace(r'\\ \end{array}', r'\end{array}')
            return True, post

        # latex_type == "tabular"
        # Separate a row break from a directly following comment so that
        # "\\%" is not mistaken for an escaped percent sign below.
        latex_code = latex_code.replace("\\\\%", "\\\\ %")
        # Protect escaped percent signs, drop everything after the first
        # remaining '%' (a comment), then restore the escaped ones.
        latex_code = latex_code.replace("\\%", "<PERCENTAGE_TOKEN>")
        latex_code = latex_code.split('%')[0]
        latex_code = latex_code.replace("<PERCENTAGE_TOKEN>", "\\%")
        if "\\end{tabular}" not in latex_code:
            latex_code += "\\end{tabular}"
        with open(middle_file, 'w') as f:
            f.write(latex_code.replace('\r', ' ').replace('\n', ' '))

        # Same perl program as before, passed as a single argv entry.
        hskip_to_hspace = r's|hskip(.*?)(cm\|in\|pt\|mm\|em)|hspace{\1\2}|g'
        if _run_filter(['perl', '-pe', hskip_to_hspace], middle_file, temp_file) != 0:
            return False, latex_code

        tokenizer = os.path.join(script_dir, 'preprocess_tabular.js')
        if _run_filter(['node', tokenizer, 'tokenize'], temp_file, middle_file) != 0:
            return False, latex_code

        post = _read_last_line_tokens(middle_file)
        if post is None:
            # The tokenizer exited cleanly but emitted nothing.
            return False, latex_code
        return True, post
    finally:
        # Scratch files must not outlive the call, whatever path we took.
        for path in (temp_file, middle_file):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
if __name__ == '__main__':
    # Manual smoke test: tokenize the LaTeX stored in ./2.txt and print the
    # input followed by the (success, tokens) result.
    with open("2.txt", 'r') as f:  # context manager so the handle is closed
        latex_code = f.read().replace('\r', ' ')
    print("=>", latex_code)
    new_code = tokenize_latex(latex_code)
    print("=>", new_code)
\ No newline at end of file
import time
import numpy as np
from PIL import Image
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
class SimpleAffineTransform:
    """
    simple affine transform, only translation and scale.

    Scaling is applied about the centroid of the coordinates passed to
    ``__call__``; the translation is added afterwards.
    """

    def __init__(self, translation=(0, 0), scale=1.0):
        self.translation = np.array(translation)
        self.scale = scale

    def estimate(self, src, dst):
        """Fit translation and scale mapping point set ``src`` onto ``dst``.

        The translation is the difference of the two centroids; the scale is
        the ratio of the mean distances of the points to their centroid.
        """
        src_center = np.mean(src, axis=0)
        dst_center = np.mean(dst, axis=0)
        self.translation = dst_center - src_center

        src_dists = np.linalg.norm(src - src_center, axis=1)
        dst_dists = np.linalg.norm(dst - dst_center, axis=1)
        # Small epsilon avoids division by zero when all src points coincide.
        self.scale = np.mean(dst_dists) / (np.mean(src_dists) + 1e-10)

    def inverse(self):
        """Return a new transform that undoes this one."""
        # The original referenced an undefined ``AffineTransform`` here,
        # which raised NameError on every call.
        return SimpleAffineTransform(-self.translation, 1.0 / self.scale)

    def __call__(self, coords):
        """Scale ``coords`` about their own centroid, then translate them."""
        center = np.mean(coords, axis=0)
        return self.scale * (coords - center) + center + self.translation

    def residuals(self, src, dst):
        """Return the per-point Euclidean distance between ``self(src)`` and ``dst``."""
        return np.sqrt(np.sum((self(src) - dst) ** 2, axis=1))
def norm_coords(x, left, right):
    """Clamp ``x`` to ``[left, right]``; the lower bound is checked first."""
    return left if x < left else (right if x > right else x)
# Tokens that should be treated as the same symbol are mapped to one
# canonical spelling. Built once at import time instead of on every call.
_SPECIAL_TOKEN_MAP = {
    "\\dot": ".",
    "\\Dot": ".",
    "\\cdot": ".",
    "\\cdotp": ".",
    "\\ldotp": ".",
    "\\mid": "|",
    "\\rightarrow": "\\to",
    "\\top": "T",
    "\\Tilde": "\\tilde",
    "\\prime": "'",
    "\\ast": "*",
    "\\left<": "\\langle",
    "\\right>": "\\rangle",
    "\\lbrace": "\\{",
    "\\rbrace": "\\}",
    "\\lbrack": "[",
    "\\rbrack": "]",
    # NOTE(review): "\\blackslash" looks like a typo for "\\backslash" - confirm.
    "\\blackslash": "/",
    "\\slash": "/",
    "\\leq": "\\le",
    "\\geq": "\\ge",
    "\\Vert": "\\|",
    "\\lVert": "\\|",
    "\\rVert": "\\|",
    "\\vert": "|",
    "\\lvert": "|",
    "\\rvert": "|",
    "\\colon": ":",
    "\\Ddot": "\\ddot",
    "\\Bar": "\\bar",
    "\\Vec": "\\vec",
    "\\parallel": "\\|",
    "\\dag": "\\dagger",
    "\\ddag": "\\ddagger",
    "\\textlangle": "<",
    "\\textrangle": ">",
    "\\textgreater": ">",
    "\\textless": "<",
    "\\textbackslash": "n",
    "\\textunderscore": "_",
    "\\=": "_",
    "\\neg": "\\lnot",
    # The table used to list "\\neq" twice ("\\ne" and "\\not="); the later
    # entry always won, so only that effective mapping is kept.
    # NOTE(review): "\\ne" itself is not mapped, so "\\ne" and "\\neq" do not
    # normalise to the same token - confirm this is intended.
    "\\neq": "\\not=",
}


def norm_same_token(token):
    """Map a LaTeX token to a canonical form so equivalent tokens compare equal.

    Steps, in order:
      1. Strip ``\\left`` / ``\\right`` from tokens starting with them,
         unless the token contains "arrow", "harpoon", "<" or ">".
      2. For tokens starting with ``\\big`` / ``\\Big``, keep only the trailing
         command (if the remainder contains a backslash) or the last
         character.
      3. Apply the canonical spelling table.
      4. Drop the ``wide`` / ``var`` / ``\\string`` part of tokens starting
         with ``\\wide``, ``\\var`` or ``\\string``.
    """
    if token.startswith('\\left') or token.startswith('\\right'):
        if "arrow" not in token and "<" not in token and ">" not in token and "harpoon" not in token:
            token = token.replace("\\left", "").replace("\\right", "")

    if token.startswith('\\big') or token.startswith('\\Big'):
        # NOTE(review): this prefix test also catches operators such as
        # "\\bigcup" / "\\bigcap", which are reduced to their last character
        # ("p"). Behaviour kept as-is; confirm whether that is intended.
        if "\\" in token[4:]:
            token = "\\" + token[4:].split("\\")[-1]
        else:
            token = token[-1]

    token = _SPECIAL_TOKEN_MAP.get(token, token)

    if token.startswith('\\wide'):
        return token.replace("wide", "")
    if token.startswith('\\var'):
        return token.replace("var", "")
    if token.startswith('\\string'):
        return token.replace("\\string", "")
    return token
class HungarianMatcher:
    """Match ground-truth and predicted token boxes by minimum total cost.

    The pairwise cost is a weighted sum of a token cost, a bounding-box
    position cost and a sequence-order cost; the assignment is solved with
    ``scipy.optimize.linear_sum_assignment``.
    """

    def __init__(
        self,
        cost_token: float = 1,
        cost_position: float = 0.05,
        cost_order: float = 0.15,
    ):
        self.cost_token = cost_token
        self.cost_position = cost_position
        self.cost_order = cost_order
        # Individual cost matrices of the most recent call, keyed by
        # "token", "position" and "order".
        self.cost = {}

    def calculate_token_cost(self, box_gt, box_pred):
        """Return the (len(box_gt), len(box_pred)) token cost matrix.

        An entry is 0 when both tokens are identical, 0.005 when they differ
        but are equal after ``norm_same_token``, and 1 otherwise.
        """
        raw_ids = {}
        norm_ids = {}

        def encode(boxes):
            # Give every distinct raw / normalised token an integer id so the
            # comparison below is a plain integer broadcast.
            raw, norm = [], []
            for data in boxes:
                token = data['token']
                raw.append(raw_ids.setdefault(token, len(raw_ids)))
                normed = norm_same_token(token)
                norm.append(norm_ids.setdefault(normed, len(norm_ids)))
            return np.array(raw, dtype=int), np.array(norm, dtype=int)

        gt_raw, gt_norm = encode(box_gt)
        pred_raw, pred_norm = encode(box_pred)

        exact = gt_raw[:, None] == pred_raw[None, :]
        same_after_norm = gt_norm[:, None] == pred_norm[None, :]
        return np.where(exact, 0.0, np.where(same_after_norm, 0.005, 1.0))

    def box2array(self, box_list, size):
        """Return an (N, 4) array of each box's ``'bbox'`` divided by ``size``.

        ``size`` is ``(W, H)``; x values are divided by ``W`` and y values by
        ``H``. An empty ``box_list`` yields an array of shape (0, 4).
        """
        W, H = size
        box_array = []
        for box in box_list:
            x_min, y_min, x_max, y_max = box['bbox']
            box_array.append([x_min/W, y_min/H, x_max/W, y_max/H])
        return np.array(box_array, dtype=float).reshape(-1, 4)

    def order2array(self, box_list, max_token_lens=None):
        """Return an (N, 1) array holding each box's index divided by ``max_token_lens``.

        ``max_token_lens`` defaults to ``len(box_list)`` when not given.
        """
        if not max_token_lens:
            max_token_lens = len(box_list)
        order_array = [[idx / max_token_lens] for idx in range(len(box_list))]
        return np.array(order_array, dtype=float).reshape(-1, 1)

    def calculate_l1_cost(self, gt_array, pred_array):
        """Return the pairwise L1 distance divided by the number of columns of ``gt_array``."""
        scale = gt_array.shape[-1]
        l1_cost = cdist(gt_array, pred_array, 'minkowski', p=1)
        return l1_cost / scale

    def __call__(self, box_gt, box_pred, gt_size, pred_size):
        """Return the matched ``(gt_index, pred_index)`` pairs.

        Args:
            box_gt, box_pred: lists of dicts with ``'token'`` and ``'bbox'``
                (``x_min, y_min, x_max, y_max``) entries.
            gt_size, pred_size: ``(W, H)`` used to scale the respective boxes.

        Returns:
            A list of index pairs; empty when either input list is empty.
        """
        if not box_gt or not box_pred:
            # Nothing can be matched; the cost helpers would otherwise fail
            # on empty arrays. Reset the stored costs so they are not stale.
            empty = np.zeros((len(box_gt), len(box_pred)))
            self.cost["token"] = empty
            self.cost["position"] = empty.copy()
            self.cost["order"] = empty.copy()
            return []

        gt_box_array = self.box2array(box_gt, gt_size)
        pred_box_array = self.box2array(box_pred, pred_size)

        # Both sequences share one denominator so their order values are
        # comparable.
        max_token_lens = max(len(box_gt), len(box_pred))
        gt_order_array = self.order2array(box_gt, max_token_lens)
        pred_order_array = self.order2array(box_pred, max_token_lens)

        token_cost = self.calculate_token_cost(box_gt, box_pred)
        position_cost = self.calculate_l1_cost(gt_box_array, pred_box_array)
        order_cost = self.calculate_l1_cost(gt_order_array, pred_order_array)

        self.cost["token"] = token_cost
        self.cost["position"] = position_cost
        self.cost["order"] = order_cost

        cost = self.cost_token * token_cost + self.cost_position * position_cost + self.cost_order * order_cost
        # Non-finite entries (e.g. from a zero-sized image) get a large
        # finite penalty so the solver can still run.
        cost[np.isnan(cost) | np.isinf(cost)] = 100
        indexes = linear_sum_assignment(cost)
        return list(zip(*indexes))
\ No newline at end of file
tqdm
matplotlib
numpy<2.0.0
scikit-image<=0.20.0
opencv-python
gradio==4.43.0 # optional
\ No newline at end of file
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/unimernet_base.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
\ No newline at end of file
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: False
load_finetuned: False
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_base_encoder6666_decoder8_dim1024_30w_8xb8_f1_lr1e_4"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_small
max_seq_len: 1536
load_pretrained: False
load_finetuned: False
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_small_encoder6666_decoder8_dim1024_30w_8xb8_f1_lr1e_4"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_tiny
max_seq_len: 1536
load_pretrained: False
load_finetuned: False
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_tiny_encoder6666_decoder8_dim1024_30w_8xb8_f1_lr1e_4"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: False
load_finetuned: True
finetuned: './models/unimernet_base.pth'
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_base"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_small
max_seq_len: 1536
load_pretrained: False
load_finetuned: True
finetuned: './models/unimernet_small.pth'
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_small"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_tiny
max_seq_len: 1536
load_pretrained: False
load_finetuned: True
finetuned: './models/unimernet_tiny.pth'
datasets:
formula_rec_train:
sample_ratio: 1
vis_processor:
train:
name: "formula_image_train"
image_size:
- 192
- 672
text_processor:
train:
name: "blip_caption"
max_words: 1536
build_info:
# unimath_train
images: ./data/UniMath1M/train/unimath_train
annotation: ./data/UniMath1M/train/unimath_train.txt
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
text_processor:
eval:
name: "blip_caption"
max_words: 1536
build_info:
images: ./data/UniMER-Test/cpe
annotation: ./data/UniMER-Test/cpe.txt
run:
runner: runner_iter
task: unimernet_train
# optimizer
lr_sched: "linear_warmup_cosine_lr"
init_lr: 1e-4
min_lr: 1e-8
warmup_lr: 1e-5
weight_decay: 0.05
batch_size_train: 8
batch_size_eval: 8
accum_grad_iters: 1
num_workers: 8
warmup_steps: 5000
iters_per_inner_epoch: 20000
max_iters: 300000
milestone: [1]
seed: 42
output_dir: "../outputs_unimernet/unimernet_tiny"
amp: True
resume_ckpt_path: null
evaluate: False
train_splits: [ "train" ]
valid_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when training an LLM
generate_cfg:
temperature: 0.0
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Image processing class: accepts formula images, outputs LaTeX code and rendered images.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bin/anaconda3/envs/unimernetv2_pip/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/Users/bin/anaconda3/envs/unimernetv2_pip/lib/python3.10/site-packages/torchtext/data/__init__.py:4: UserWarning: \n",
"/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n",
"Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n",
" warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n"
]
}
],
"source": [
"import argparse\n",
"import os\n",
"import random\n",
"import sys\n",
"\n",
"from IPython.display import display, Math\n",
"from PIL import Image\n",
"from rich import print as rprint\n",
"from rich.panel import Panel\n",
"from rich.rule import Rule\n",
"from rich.table import Table\n",
"from termcolor import colored\n",
"import torch\n",
"\n",
"sys.path.insert(0, os.path.join(os.getcwd(), \"..\"))\n",
"from unimernet.common.config import Config\n",
"from unimernet.datasets.builders import *\n",
"from unimernet.models import *\n",
"from unimernet.processors import *\n",
"import unimernet.tasks as tasks\n",
"from unimernet.processors import load_processor\n",
"\n",
"class ImageProcessor:\n",
" \n",
" def __init__(self, cfg_path, image_dir):\n",
" self.cfg_path = cfg_path\n",
" self.image_dir = image_dir\n",
" self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
" self.model, self.vis_processor = self.load_model_and_processor()\n",
"\n",
" def load_model_and_processor(self):\n",
" args = argparse.Namespace(cfg_path=self.cfg_path, options=None)\n",
" cfg = Config(args)\n",
" task = tasks.setup_task(cfg)\n",
" model = task.build_model(cfg).to(self.device)\n",
" vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)\n",
"\n",
" return model, vis_processor\n",
"\n",
" def process_single_image(self, image_path):\n",
" try:\n",
" raw_image = Image.open(image_path)\n",
" except IOError:\n",
" print(f\"Error: Unable to open image at {image_path}\")\n",
" return\n",
"\n",
" resized_image = self.resize_image(raw_image)\n",
" image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)\n",
" output = self.model.generate({\"image\": image})\n",
" pred = output[\"pred_str\"][0]\n",
" self.print_result(0, image_path, resized_image, pred)\n",
" rprint(Rule(style=\"black\"))\n",
"\n",
" def process_images(self):\n",
" image_names = os.listdir(self.image_dir)\n",
" image_paths = [os.path.join(self.image_dir, name) for name in image_names]\n",
"\n",
" for id, image_path in enumerate(image_paths):\n",
" raw_image = Image.open(image_path)\n",
" resized_image = self.resize_image(raw_image)\n",
" image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)\n",
" output = self.model.generate({\"image\": image})\n",
" pred = output[\"pred_str\"][0]\n",
" self.print_result(id, image_path, resized_image, pred)\n",
" rprint(Rule(style=\"black\"))\n",
"\n",
" @staticmethod\n",
" def resize_image(image, max_len=600):\n",
" width, height = image.size\n",
" if max(width, height) > max_len :\n",
" if width > height:\n",
" scale = float(max_len) / width\n",
" width = max_len\n",
" height = int(height * scale)\n",
" else:\n",
" scale = float(max_len) / height\n",
" height = max_len\n",
" width = int(width * scale)\n",
"\n",
" return image.resize((width, height))\n",
"\n",
" @staticmethod\n",
" def print_result(id, image_path, raw_image, pred):\n",
" colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']\n",
" chosen_color = random.choice(colors)\n",
"\n",
" table = Table(show_header=True, header_style=chosen_color)\n",
" table.add_column(\"Sample ID\", style=\"dim\", width=12)\n",
" table.add_column(\"Image Path\", style=\"dim\", width=80)\n",
" table.add_row(str(id), image_path)\n",
" rprint(table)\n",
" print(colored(f\"{id}_1: Source image\", chosen_color), end=\" \")\n",
" display(raw_image)\n",
" print(colored(f'{id}_2: Rendered image from LaTeX', chosen_color), end=\" \")\n",
" render_katex(pred)\n",
" print(colored(f'{id}_3: Predicted LaTeX code', chosen_color), end=\" \")\n",
" pred_text_panel = Panel.fit(pred, title=\"Predicted LaTeX\", border_style=chosen_color)\n",
" rprint(pred_text_panel)\n",
"\n",
"def render_katex(latex_string, show=True):\n",
" display(Math(latex_string))\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bin/anaconda3/envs/unimernetv2_pip/lib/python3.10/site-packages/transformers/models/auto/image_processing_auto.py:510: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CustomVisionEncoderDecoderModel init\n",
"VariableUnimerNetModel init\n",
"VariableUnimerNetPatchEmbeddings init\n",
"VariableUnimerNetModel init\n",
"VariableUnimerNetPatchEmbeddings init\n",
"CustomMBartForCausalLM init\n",
"CustomMBartDecoder init\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bin/anaconda3/envs/unimernetv2_pip/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:540: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n",
"/Users/bin/anaconda3/envs/unimernetv2_pip/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:545: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.95` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"color: #000080; text-decoration-color: #000080\"> Sample ID </span>┃<span style=\"color: #000080; text-decoration-color: #000080\"> Image Path </span>┃\n",
"┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
"│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 0 </span>│<span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> /Users/bin/code/GoGoGo/UniMERNet/asset/test_imgs/0000001.png </span>│\n",
"└──────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[34m \u001b[0m\u001b[34mSample ID \u001b[0m\u001b[34m \u001b[0m┃\u001b[34m \u001b[0m\u001b[34mImage Path \u001b[0m\u001b[34m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
"│\u001b[2m \u001b[0m\u001b[2m0 \u001b[0m\u001b[2m \u001b[0m│\u001b[2m \u001b[0m\u001b[2m/Users/bin/code/GoGoGo/UniMERNet/asset/test_imgs/0000001.png \u001b[0m\u001b[2m \u001b[0m│\n",
"└──────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[34m0_1: Source image\u001b[0m "
]
},
{
"data": {
"image/jpeg": "",
"image/png": "",
"text/plain": [
"<PIL.Image.Image image mode=L size=600x136>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[34m0_2: Rendered image from LaTeX\u001b[0m "
]
},
{
"data": {
"text/latex": [
"$\\displaystyle \\begin{array} { r l } { \\mathrm { M i n i m i s e ~ } } & { { } J ( u . ; s , y ) = \\mathbb { E } \\left[ \\int _ { s } ^ { T } \\left( u _ { t } ^ { 2 } + 1 \\right) d t - \\ln \\left( \\cosh \\left( X _ { T } \\right) \\right) \\right] } \\\\ { \\mathrm { s u b j e c t ~ t o ~ } } & { { } \\left\\{ \\begin{array} { l l } { d X _ { t } = 2 u _ { t } d t + \\sqrt { 2 } d W _ { t } , t \\in [ s , T ] } \\\\ { X _ { s } = y } \\\\ { u _ { t } \\in [ - 1 , 1 ] , \\quad t \\in [ s , T ] } \\end{array} \\right. } \\end{array}$"
],
"text/plain": [
"<IPython.core.display.Math object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[34m0_3: Predicted LaTeX code\u001b[0m "
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000080; text-decoration-color: #000080\">╭──────────────────────────────────────────────── Predicted LaTeX ────────────────────────────────────────────────╮</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> \\begin{array} { r l } { \\mathrm { M i n i m i s e ~ } } &amp; { { } J ( u . ; s , y ) = \\mathbb { E } \\left[ \\int _ <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> { s } ^ { T } \\left( u _ { t } ^ { 2 } + 1 \\right) d t - \\ln \\left( \\cosh \\left( X _ { T } \\right) \\right) <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> \\right] } \\\\ { \\mathrm { s u b j e c t ~ t o ~ } } &amp; { { } \\left\\{ \\begin{array} { l l } { d X _ { t } = 2 u _ <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> { t } d t + \\sqrt { 2 } d W _ { t } , t \\in [ s , T ] } \\\\ { X _ { s } = y } \\\\ { u _ { t } \\in [ - 1 , 1 ] , <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">│</span> \\quad t \\in [ s , T ] } \\end{array} \\right. } \\end{array} <span style=\"color: #000080; text-decoration-color: #000080\">│</span>\n",
"<span style=\"color: #000080; text-decoration-color: #000080\">╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[34m╭─\u001b[0m\u001b[34m───────────────────────────────────────────────\u001b[0m\u001b[34m Predicted LaTeX \u001b[0m\u001b[34m───────────────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n",
"\u001b[34m│\u001b[0m \\begin{array} { r l } { \\mathrm { M i n i m i s e ~ } } & { { } J ( u . ; s , y ) = \\mathbb { E } \\left[ \\int _ \u001b[34m│\u001b[0m\n",
"\u001b[34m│\u001b[0m { s } ^ { T } \\left( u _ { t } ^ { 2 } + 1 \\right) d t - \\ln \\left( \\cosh \\left( X _ { T } \\right) \\right) \u001b[34m│\u001b[0m\n",
"\u001b[34m│\u001b[0m \\right] } \\\\ { \\mathrm { s u b j e c t ~ t o ~ } } & { { } \\left\\{ \\begin{array} { l l } { d X _ { t } = 2 u _ \u001b[34m│\u001b[0m\n",
"\u001b[34m│\u001b[0m { t } d t + \\sqrt { 2 } d W _ { t } , t \\in [ s , T ] } \\\\ { X _ { s } = y } \\\\ { u _ { t } \\in [ - 1 , 1 ] , \u001b[34m│\u001b[0m\n",
"\u001b[34m│\u001b[0m \\quad t \\in [ s , T ] } \\end{array} \\right. } \\end{array} \u001b[34m│\u001b[0m\n",
"\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #000000; text-decoration-color: #000000\">───────────────────────────────────────────────────────────────────────────────────────────────────────────────────</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[30m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"root_path = os.path.abspath(os.getcwd())\n",
"config_path = os.path.join(root_path, \"configs/demo.yaml\")\n",
"image_directory = os.path.join(root_path, \"asset/test_imgs\")\n",
"\n",
"processor = ImageProcessor(config_path, image_directory)\n",
"\n",
"# Process a single image located at the specified path\n",
"processor.process_single_image(os.path.join(image_directory, '0000001.png'))\n",
"\n",
"# Uncomment the following line to process all images in the specified directory\n",
"# processor.process_images()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
import argparse
import os
import sys
import numpy as np
import cv2
import torch
from PIL import Image
sys.path.insert(0, os.path.join(os.getcwd(), ".."))
from unimernet.common.config import Config
import unimernet.tasks as tasks
from unimernet.processors import load_processor
class ImageProcessor:
    """Load a UniMERNet model once and run recognition on single images.

    The model and its image pre-processor are built from the config file
    given at construction time and kept on the instance, so repeated calls
    to ``process_single_image`` reuse them.
    """

    def __init__(self, cfg_path):
        """Build the model and visual processor described by *cfg_path*.

        Args:
            cfg_path: Path to the configuration file handed to ``Config``.
        """
        self.cfg_path = cfg_path
        # Use the GPU when one is visible to torch, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model, self.vis_processor = self.load_model_and_processor()

    def load_model_and_processor(self):
        """Create the model (moved to ``self.device``) and the eval processor.

        Returns:
            A ``(model, vis_processor)`` tuple.
        """
        # ``Config`` is given an argparse-style namespace; no option
        # overrides are supplied here.
        args = argparse.Namespace(cfg_path=self.cfg_path, options=None)
        cfg = Config(args)
        task = tasks.setup_task(cfg)
        model = task.build_model(cfg).to(self.device)
        vis_processor = load_processor(
            'formula_image_eval',
            cfg.config.datasets.formula_rec_eval.vis_processor.eval,
        )
        return model, vis_processor

    def process_single_image(self, image_path):
        """Recognise the image at *image_path*, print and display the result.

        The prediction is printed, then the input image is shown in an
        OpenCV window; the call blocks until a key is pressed in that window.

        Args:
            image_path: Path of the image file to open.

        Returns:
            The first entry of the model output's ``"pred_str"`` list
            (presumably the predicted LaTeX string -- confirm against the
            model), or ``None`` if the image could not be opened.
        """
        try:
            raw_image = Image.open(image_path)
        except IOError:
            print(f"Error: Unable to open image at {image_path}")
            return None

        # Build the preview in OpenCV's BGR channel order.  Normalising to
        # RGB first keeps the channel flip correct for every PIL mode:
        # reversing the last axis of an RGBA / LA / palette image directly
        # would scramble or drop channels.
        rgb_pixels = np.array(raw_image.convert("RGB"))
        open_cv_image = rgb_pixels[:, :, ::-1].copy()

        # The model still receives the untouched PIL image; the processor
        # output gets a leading batch axis of size one before inference.
        image = self.vis_processor(raw_image).unsqueeze(0).to(self.device)
        output = self.model.generate({"image": image})
        pred = output["pred_str"][0]
        print(f'Prediction:\n{pred}')

        # Display the image using cv2 and wait for a key press before
        # closing the window.
        cv2.imshow('Original Image', open_cv_image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        return pred
if __name__ == "__main__":
    # All paths are resolved against the current working directory.
    root_path = os.path.abspath(os.getcwd())
    config_path = os.path.join(root_path, "configs/demo.yaml")
    image_path = os.path.join(root_path, 'asset/test_imgs', '0000001.png')

    # Build the model once, then run it on the single bundled sample image.
    processor = ImageProcessor(config_path)
    latex_code = processor.process_single_image(image_path)
CustomVisionEncoderDecoderModel init
CustomMBartForCausalLM init
CustomMBartDecoder init
arch_name:unimernet
model_type:unimernet
checkpoint:
====================================================================================================
Device:cuda
Load model: 23.643s
len_gts:6762, len_preds=6762
norm_gts[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
norm_preds[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
Evaluation Set:Simple Print Expression(SPE)
Inference Time: 690.366847038269s
bleu ⬆ edit ⬇
-------- --------
0.917711 0.058556
====================================================================================================
len_gts:5921, len_preds=5921
norm_gts[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u,v,w,z,x}\},\{\boldsymbol{\kappa,\lambda,\mu,\nu}\})=\frac{1}{2}\|\mathbf{y-Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax-u}\|_{2}^{2}+\boldsymbol{\kappa}^{\top}(\mathbf{Ax-u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x-v}\|_{2}^{2}+\boldsymbol{\lambda}^{\top}(\mathbf{x-v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx-w}\|_{2}^{2}+\boldsymbol{\mu}^{\top}(\mathbf{Dx-w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x-z}\|_{2}^{2}+\boldsymbol{\nu}^{\top}(\mathbf{x-z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
norm_preds[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u},\mathbf{v},\mathbf{w},\mathbf{z},\mathbf{x}\},\{\kappa,\lambda,\mu,\nu\})=\frac{1}{2}\|\mathbf{y}-\mathbf{Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax}-\mathbf{u}\|_{2}^{2}+\kappa^{\top}(\mathbf{Ax}-\mathbf{u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x}-\mathbf{v}\|_{2}^{2}+\boldsymbol{\lambda}^{\top}(\mathbf{x}-\mathbf{v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx}-\mathbf{w}\|_{2}^{2}+\mu^{\top}(\mathbf{Dx}-\mathbf{w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x}-\mathbf{z}\|_{2}^{2}+\nu^{\top}(\mathbf{x}-\mathbf{z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
Evaluation Set:Complex Print Expression(CPE)
Inference Time: 2109.273235321045s
bleu ⬆ edit ⬇
-------- --------
0.913843 0.057546
====================================================================================================
len_gts:4742, len_preds=4742
norm_gts[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
norm_preds[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
Evaluation Set:Screen Capture Expression(SCE)
Inference Time: 2425.0937654972076s
bleu ⬆ edit ⬇
-------- --------
0.617196 0.22953
====================================================================================================
len_gts:6332, len_preds=6332
norm_gts[0]:b_{n+1}-b_{n}=-1
norm_preds[0]:b_{n+1}-b_{n}=-1
Evaluation Set:Handwritten Expression(HWE)
Inference Time: 2543.7626719474792s
bleu ⬆ edit ⬇
-------- ---------
0.920905 0.0546271
====================================================================================================
CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init
arch_name:unimernet
model_type:unimernet
checkpoint:
====================================================================================================
Device:cuda
Load model: 25.267s
len_gts:6762, len_preds=6762
norm_gts[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
norm_preds[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
Evaluation Set:Simple Print Expression(SPE)
Inference Time: 561.8906960487366s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.915231 0.901589 0.0600379
====================================================================================================
len_gts:5921, len_preds=5921
norm_gts[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u,v,w,z,x}\},\{\boldsymbol{\kappa,\lambda,\mu,\nu}\})=\frac{1}{2}\|\mathbf{y-Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax-u}\|_{2}^{2}+\boldsymbol{\kappa}^{\top}(\mathbf{Ax-u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x-v}\|_{2}^{2}+\boldsymbol{\lambda}^{\top}(\mathbf{x-v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx-w}\|_{2}^{2}+\boldsymbol{\mu}^{\top}(\mathbf{Dx-w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x-z}\|_{2}^{2}+\boldsymbol{\nu}^{\top}(\mathbf{x-z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
norm_preds[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u},\mathbf{v},\mathbf{w},\mathbf{z},\mathbf{x}\},\{\kappa,\lambda,\mu,\nu\})=\frac{1}{2}\|\mathbf{y}-\mathbf{Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax}-\mathbf{u}\|_{2}^{2}+\kappa^{\top}(\mathbf{Ax}-\mathbf{u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x}-\mathbf{v}\|_{2}^{2}+\lambda^{\top}(\mathbf{x}-\mathbf{v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx}-\mathbf{w}\|_{2}^{2}+\mu^{\top}(\mathbf{Dx}-\mathbf{w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x}-\mathbf{z}\|_{2}^{2}+\nu^{\top}(\mathbf{x}-\mathbf{z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
Evaluation Set:Complex Print Expression(CPE)
Inference Time: 1556.0770707130432s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.924907 0.901307 0.0561145
====================================================================================================
len_gts:4742, len_preds=4742
norm_gts[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
norm_preds[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
Evaluation Set:Screen Capture Expression(SCE)
Inference Time: 1822.9997293949127s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- --------
0.626271 0.677379 0.223768
====================================================================================================
len_gts:6332, len_preds=6332
norm_gts[0]:b_{n+1}-b_{n}=-1
norm_preds[0]:b_{n+1}-b_{n}=-1.
Evaluation Set:Handwritten Expression(HWE)
Inference Time: 1900.8741807937622s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.894818 0.853878 0.0716135
====================================================================================================
\ No newline at end of file
CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init
arch_name:unimernet
model_type:unimernet
checkpoint:
====================================================================================================
Device:cuda
Load model: 14.039s
len_gts:6762, len_preds=6762
norm_gts[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
norm_preds[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
Evaluation Set:Simple Print Expression(SPE)
Inference Time: 489.57221508026123s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.913463 0.89984 0.0614923
====================================================================================================
len_gts:5921, len_preds=5921
norm_gts[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u,v,w,z,x}\},\{\boldsymbol{\kappa,\lambda,\mu,\nu}\})=\frac{1}{2}\|\mathbf{y-Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax-u}\|_{2}^{2}+\boldsymbol{\kappa}^{\top}(\mathbf{Ax-u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x-v}\|_{2}^{2}+\boldsymbol{\lambda}^{\top}(\mathbf{x-v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx-w}\|_{2}^{2}+\boldsymbol{\mu}^{\top}(\mathbf{Dx-w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x-z}\|_{2}^{2}+\boldsymbol{\nu}^{\top}(\mathbf{x-z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
norm_preds[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u,v,w,z,x}\},\{\kappa,\lambda,\mu,\nu\})=\frac{1}{2}\|\mathbf{y-Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax-u}\|_{2}^{2}+\kappa^{\top}(\mathbf{Ax-u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x-v}\|_{2}^{2}+\lambda^{\top}(\mathbf{x-v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx-w}\|_{2}^{2}+\mu^{\top}(\mathbf{Dx-w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x-z}\|_{2}^{2}+\nu^{\top}(\mathbf{x-z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
Evaluation Set:Complex Print Expression(CPE)
Inference Time: 1294.0490927696228s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.919805 0.895851 0.0597243
====================================================================================================
len_gts:4742, len_preds=4742
norm_gts[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
norm_preds[0]:F_{i}[z](x,y)=f_{i}(x,y,z)~i=1,\ldots,n,
Evaluation Set:Screen Capture Expression(SCE)
Inference Time: 1570.1024556159973s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- --------
0.617545 0.678047 0.22849
====================================================================================================
len_gts:6332, len_preds=6332
norm_gts[0]:b_{n+1}-b_{n}=-1
norm_preds[0]:b_{n+1}-b_{n}=-1
Evaluation Set:Handwritten Expression(HWE)
Inference Time: 1641.9185965061188s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.889342 0.850023 0.0748587
================================
\ No newline at end of file
CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init
arch_name:unimernet
model_type:unimernet
checkpoint:
====================================================================================================
Device:cuda
Load model: 10.674s
len_gts:6762, len_preds=6762
norm_gts[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
norm_preds[0]:S\sim\tilde{\psi}Q_{o}\tilde{\psi}+g_{s}^{1/2}\tilde{\psi}^{3}+\tilde{\phi}Q_{c}\tilde{\phi}+g_{s}\tilde{\phi}^{3}+\tilde{\phi}B(g_{s}^{1/2}\tilde{\psi})+\cdots.
Evaluation Set:Simple Print Expression(SPE)
Inference Time: 437.5355474948883s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.909031 0.895183 0.0661603
====================================================================================================
len_gts:5921, len_preds=5921
norm_gts[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u,v,w,z,x}\},\{\boldsymbol{\kappa,\lambda,\mu,\nu}\})=\frac{1}{2}\|\mathbf{y-Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax-u}\|_{2}^{2}+\boldsymbol{\kappa}^{\top}(\mathbf{Ax-u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x-v}\|_{2}^{2}+\boldsymbol{\lambda}^{\top}(\mathbf{x-v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx-w}\|_{2}^{2}+\boldsymbol{\mu}^{\top}(\mathbf{Dx-w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x-z}\|_{2}^{2}+\boldsymbol{\nu}^{\top}(\mathbf{x-z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
norm_preds[0]:\begin{array}{r l}{\mathcal{L}(\{\mathbf{u},\mathbf{v},\mathbf{w},\mathbf{z},\mathbf{x}\},\{\mathbf{x},\lambda,\mu,\nu\})=\frac{1}{2}\|\mathbf{y}-\mathbf{Cu}\|_{2}^{2}}&{+\tau_{1}\|\mathbf{v}\|_{1}+\tau_{2}\|\mathbf{w}\|_{1}}\\ &{+\frac{\rho_{1}}{2}\|\mathbf{Ax}-\mathbf{u}\|_{2}^{2}+\kappa^{\top}(\mathbf{Ax}-\mathbf{u})}\\ &{+\frac{\rho_{2}}{2}\|\mathbf{x}-\mathbf{v}\|_{2}^{2}+\lambda^{\top}(\mathbf{x}-\mathbf{v})}\\ &{+\frac{\rho_{3}}{2}\|\mathbf{Dx}-\mathbf{w}\|_{2}^{2}+\mu^{\top}(\mathbf{Dx}-\mathbf{w})}\\ &{+\frac{\rho_{4}}{2}\|\mathbf{x}-\mathbf{z}\|_{2}^{2}+\nu^{\top}(\mathbf{x}-\mathbf{z})}\\ &{+\mathcal{I}_{+}(\mathbf{z})}\end{array}
Evaluation Set:Complex Print Expression(CPE)
Inference Time: 1043.2925176620483s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.902193 0.876609 0.0746548
====================================================================================================
len_gts:4742, len_preds=4742
norm_gts[0]:F_{i}[z](x,y)=f_{i}(x,y,z)\ i=1,\ldots,n,
norm_preds[0]:F_{i}[z](x,y)=f_{i}(x,y,z)~i=1,\ldots,n,
Evaluation Set:Screen Capture Expression(SCE)
Inference Time: 1415.7678081989288s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- --------
0.56585 0.672292 0.238716
====================================================================================================
len_gts:6332, len_preds=6332
norm_gts[0]:b_{n+1}-b_{n}=-1
norm_preds[0]:b_{n+1}-b_{n}=-1
Evaluation Set:Handwritten Expression(HWE)
Inference Time: 1480.4430103302002s
bleu ⬆ meteor ⬆ edit ⬇
-------- ---------- ---------
0.883151 0.845897 0.0783475
====================================================================================================
# 模型唯一标识
modelCode=1820
# 模型名称
modelName=unimernet_transformers
# 模型描述
modelDescription=UniMERNet是一款专为数学公式识别设计的深度学习模型,支持将手写或打印的数学公式图像转换为LaTeX代码,适用于多种应用场景
processType=推理
# 算法类别
appScenario=文本生成
# 框架类型
frameType=vllm
# 加速卡类型
accelerateType=K100AI
\ No newline at end of file
Put [model files](https://huggingface.co/wanderkid/unimernet/tree/main) here:
unimernet
├── README.md
├── config.json
├── preprocessor_config.json
├── pytorch_model.bin
├── tokenizer.json
└── tokenizer_config.json
\ No newline at end of file
[tool.poetry]
name = "unimernet"
version = "0.2.3"
description = 'UniMERNet: A Universal Network for Real-World Mathematical Expression Recognition'
authors = ["Bin Wang <ictwangbin@gmail.com>"]
readme = "README.md"
license = "Apache License 2.0"
repository = "https://github.com/opendatalab/UniMERNet"
keywords = ["MER", "latex", "markdown", "pdf"]
include = [
"train.py",
"test.py",
"demo.py",
"unimernet_app.py",
"run_unimernet_app.py",
]
[tool.poetry.dependencies]
python = ">=3.10"
torch = ">=2.2.2"
torchvision = ">=0.17.2"
omegaconf = "^2.3.0"
matplotlib = "^3.8.4"
iopath = "^0.1.9"
timm = "^0.9.16"
opencv-python = "^4.6.0"
transformers = "4.42.4"
fairscale = "^0.4.13"
# ftfy = "^6.2.0"
ftfy = {version = "^6.2.0", python = ">=3.10,<4.0"}
albumentations = "^1.4.4"
wand = "^0.6.13"
webdataset = "^0.2.86"
rapidfuzz = "^3.8.1"
termcolor = "^2.4.0"
pandas = "^2.2.2"
evaluate = "^0.4.1"
rich = "^13.7.1"
jupyterlab = "^4.1.6"
tabulate = "^0.9.0"
nltk = "^3.8.1"
streamlit = "^1.33.0"
pypdfium2 = "^4.29.0"
pdf2image = "^1.17.0"
streamlit_drawable_canvas = "^0.9.3"
[tool.poetry.extras]
full = [
"termcolor",
"pandas",
"rich",
"jupyterlab",
"tabulate",
"nltk",
"streamlit",
"pypdfium2",
"pdf2image",
"streamlit_drawable_canvas"
]
[tool.poetry.scripts]
unimernet = "demo:main"
unimernet_gui = "run_unimernet_app:run_app"
unimernet_eval = "test:main"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment