"test/vscode:/vscode.git/clone" did not exist on "64956d54b2b2be9681da01fe214277a0eef05be2"
Commit f49cb0ae authored by luopl's avatar luopl
Browse files

"Initial commit"

parents
Pipeline #3091 canceled with stages
from transformers import AutoProcessor
from transformers import HunYuanVLForConditionalGeneration
from PIL import Image
import numpy as np
import requests
import torch
import base64
from io import BytesIO
def clean_repeated_substrings(text):
"""Clean repeated substrings in text"""
n = len(text)
if n<8000:
return text
for length in range(2, n // 10 + 1):
candidate = text[-length:]
count = 0
i = n - length
while i >= 0 and text[i:i + length] == candidate:
count += 1
i -= length
if count >= 10:
return text[:n - length * (count - 1)]
return text
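# Illustrative note (added comment, not part of the original logic): for outputs
# longer than 8000 characters, the loop looks for the shortest substring of
# length 2..n//10 that repeats at least 10 times at the very end of the text and
# keeps only one trailing copy of it, e.g. a runaway generation ending in
# "...ABABAB...AB" (>=10 trailing "AB" repeats) is truncated after the first "AB".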
def get_image(input_source):
if input_source.startswith(('http://', 'https://')):
response = requests.get(input_source)
response.raise_for_status()
return Image.open(BytesIO(response.content))
else:
return Image.open(input_source)
def main():
model_name_or_path = "tencent/HunyuanOCR"
processor = AutoProcessor.from_pretrained(model_name_or_path, use_fast=False)
img_path = "./assets/tools-dark.png"
messages1 = [
{
"role": "user",
"content": [
{"type": "image", "image": img_path},
{"type": "text", "text": (
"提取文档图片中正文的所有信息用markdown格式表示,其中页眉、页脚部分忽略,表格用html格式表达,文档中公式用latex格式表示,按照阅读顺序组织进行解析。"
)},
],
}
]
messages = [messages1]
texts = [
processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
for msg in messages
]
image_inputs = get_image(img_path)
inputs = processor(
text=texts,
images=image_inputs,
padding=True,
return_tensors="pt",
)
model = HunYuanVLForConditionalGeneration.from_pretrained(
model_name_or_path,
attn_implementation="eager",
dtype=torch.bfloat16,
device_map="auto"
)
with torch.no_grad():
device = next(model.parameters()).device
inputs = inputs.to(device)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
if "input_ids" in inputs:
input_ids = inputs.input_ids
else:
print("inputs: # fallback", inputs)
input_ids = inputs.inputs
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
]
    output_texts = [
        clean_repeated_substrings(t) for t in processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
    ]
print(output_texts)
if __name__ == '__main__':
main()
"""
Post-process for OminiDocBench
Because HunYuanOCR is end-to-end parsing and ignores the restrictions of the pre-layout on the panel categories,
the model's parsing results are diverse.
Quick match may have about 8% false matches. We adopted a hierarchical paradigm:
- Edit distance < 0.4:
We consider this type of match a correct match and directly use it as Form_part1.
- Edit distance >= 0.4:
We believe this case may be caused by model parsing failure or incorrect matching.
We adjust the match through a simple automated post-processing + manual post-processing paradigm.
"""
import json
import re
import os
from difflib import SequenceMatcher
from collections import Counter
# ======================= Tool Functions =======================
def remove_big_braces(s: str):
pattern = r'\\(big|Big|bigg|Bigg)\{([^\}]+)\}'
repl = r'\\\1\2'
return re.sub(pattern, repl, s)
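# Illustrative note (added comment): the substitution strips the braces that
# sometimes wrap a \big-style delimiter, e.g.
#   remove_big_braces(r"\Big{(}x\Big{)}")  ->  r"\Big(x\Big)"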
def process_final_ans(final_ans):
for item in final_ans:
if "pred" in item and isinstance(item["pred"], str):
item["pred"] = remove_big_braces(item["pred"])
return final_ans
def clean_gt_tail(gt: str):
pattern = r'(\\quad+|\\qquad+)\s*\{?\(\s*\d+\s*\)\}?\s*$'
return re.sub(pattern, '', gt).rstrip()
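# Illustrative note (added comment): the regex drops a trailing equation-number
# tag introduced by \quad/\qquad, e.g.
#   clean_gt_tail(r"a + b = c \qquad (12)")  ->  "a + b = c"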
def load_instances(jsonl_path):
instances = []
with open(jsonl_path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
instances.append(obj)
except json.JSONDecodeError as e:
print(f"[WARN] Parsing failed: {e}, skipped")
print(f"[INFO] Read {len(instances)} markdown instances from {jsonl_path}" )
return instances
def count_id_distribution(error_match_ans):
"""
Perform distribution statistics on the id field of error_match_ans (intermediate output)
"""
id_list = [item['id'] for item in error_match_ans if 'id' in item]
counter = Counter(id_list)
print("\n===============================")
print("📌 ID field distribution statistics (intermediate output)")
print("===============================")
print("Total number of records:", len(error_match_ans))
print("Number of records containing id:", len(id_list))
print("\n=== Distribution of the id field ===")
for id_val, cnt in counter.most_common():
print(f"id={id_val} : {cnt} 次")
print("===============================\n")
return counter
# ---------- Matching Utilities ----------
def normalize_for_match(text: str):
text = re.sub(r"\\textcircled\{a\}", "ⓐ", text)
text = re.sub(r"\\textcircled\{b\}", "ⓑ", text)
text = re.sub(r"\\textcircled\{c\}", "ⓒ", text)
text = re.sub(r"\\textcircled\{d\}", "ⓓ", text)
text = text.replace("\\text{ⓐ}", "ⓐ")
text = text.replace("\\text{ⓑ}", "ⓑ")
text = text.replace("\\text{ⓒ}", "ⓒ")
text = text.replace("\\text{ⓓ}", "ⓓ")
text = text.replace(" ", "")
return text
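# Illustrative note (added comment): normalization maps LaTeX circled-letter
# markers to their Unicode forms and strips spaces so fuzzy matching ignores
# spacing, e.g.
#   normalize_for_match(r"\textcircled{a} x + y")  ->  "ⓐx+y"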
def clean_formula(text: str):
return (text.replace("\\quad", "")
.replace("$", "")
.strip())
def extract_candidates(markdown: str):
lines = markdown.split("\n")
candidates = []
for line in lines:
line = line.strip()
if not line:
continue
line = re.sub(r"^\s*\d+\.\s*", "", line)
cleaned = clean_formula(line)
if cleaned:
candidates.append(cleaned)
return candidates
def best_match(gt: str, candidates):
gt_norm = normalize_for_match(gt)
best_score = -1
best_cand = None
for cand in candidates:
cand_norm = normalize_for_match(cand)
score = SequenceMatcher(None, gt_norm, cand_norm).ratio()
if score > best_score:
best_score = score
best_cand = cand
return best_cand, best_score
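# Illustrative note (added comment): best_match scores every candidate against
# the normalized ground truth with difflib.SequenceMatcher and returns the
# highest-scoring candidate together with its ratio, e.g.
#   best_match("x+y=z", ["a+b=c", "x + y = z"])  ->  ("x + y = z", 1.0)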
def process_badcases(Form_part2):
results = []
for case in Form_part2:
markdown = case["markdown"]
gt = case["gt"]
candidates = extract_candidates(markdown)
pred, score = best_match(gt, candidates)
pred = pred.replace("\\text{ⓐ}","\\textcircled{a}") \
.replace("\\text{ⓑ}","\\textcircled{b}") \
.replace("\\text{ⓒ}","\\textcircled{c}") \
.replace("\\text{ⓓ}","\\textcircled{d}") \
.replace("ⓐ","\\textcircled{a}") \
.replace("ⓑ","\\textcircled{b}") \
.replace("ⓒ","\\textcircled{c}") \
.replace("ⓓ","\\textcircled{d}")
results.append({
'img_id': case['img_id'],
"gt": gt,
"pred": pred,
"match_score": score
})
return results
# ======================= Main Function =======================
def process_formula_matching(match_file, markdown_file, markdown_key, output_file):
# ----------- Step1: Read the matching result file -----------
with open(match_file, "r", encoding="utf-8") as f:
raw_data = json.load(f)
final_ans = []
for idx, item in enumerate(raw_data):
ref = item['gt'].replace('$', '') \
.replace('\[', '').replace('\]','') \
.replace(',', ',') \
.strip()
pred = item['pred'].replace('$', '') \
.replace('\[', '').replace('\]','') \
.replace(',', ',') \
.strip()
final_ans.append({
'img_id': f"{idx}",
'id': f"{item['img_id']}",
'gt': ref,
'pred': pred,
'edit': item['edit']
})
final_ans = process_final_ans(final_ans)
# ----------- Step2: Split Form_part1 / error_match -----------
Form_part1 = []
error_match_ans = []
for item in final_ans:
item['pred'] = clean_gt_tail(item['pred'])
if item['edit'] < 0.4:
Form_part1.append(item)
else:
error_match_ans.append(item)
distribution = count_id_distribution(error_match_ans)
# ----------- Step3: Write markdown into error_match_ans -----------
markdown_data = load_instances(markdown_file)
for item in markdown_data:
basename = os.path.basename(item['image_path'][0])
for seq in error_match_ans:
if basename == seq['id']:
seq['markdown'] = item[markdown_key]
# ----------- Step4: Special case handling for Form_part2 (id points to a specific file) -----------
Form_part2 = [
x for x in error_match_ans
if x['id'] == "yanbaopptmerge_9081a70ff98b3e7d640660a9412c447d.pdf_1287.jpg"
]
# Matching bad samples
out = process_badcases(Form_part2)
# ----------- Step5: For regular error matching, substring rules are used directly. -----------
Form_part3 = []
for item in error_match_ans:
if item['id'] == "yanbaopptmerge_9081a70ff98b3e7d640660a9412c447d.pdf_1287.jpg":
continue
gt = item['gt'].replace(' ', '')
answer = item['markdown'].replace('$','').replace(' ','')
if gt in answer:
item['pred'] = item['gt']
Form_part3.append(item)
# ----------- Step6: Combine all results and output. -----------
merge_form = Form_part1 + out + Form_part3
with open(output_file, "w", encoding="utf-8") as f:
json.dump(merge_form, f, ensure_ascii=False, indent=4)
print(f"[DONE] Saved final result to {output_file}")
# ======================= END =======================
if __name__ == "__main__":
process_formula_matching(
match_file="vllm_omni_quick_match_display_formula_result.json",# Omnidocbenh quick match formula matching results
markdown_file="OCR_OmniDocbench_vllm_infer_res.jsonl", # parsing result jsonl from vllm
markdown_key="vllm_answer_eth", # answer key
output_file="Final_formula.json" #The output file will be evaluated using the same method described in https://github.com/opendatalab/UniMERNet/tree/main/cdm.
)
import json
import base64
from openai import OpenAI
from tqdm import tqdm
from typing import Dict, List
def encode_image(image_path: str) -> str:
"""
Encode image file to base64 string.
Args:
image_path: Path to the image file
Returns:
Base64 encoded string of the image
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def create_chat_messages(image_path: str, prompt: str) -> List[Dict]:
"""
Create chat messages with image and prompt.
Args:
image_path: Path to the image file
prompt: Text prompt for the model
Returns:
List of message dictionaries
"""
return [
{"role": "system", "content": ""},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encode_image(image_path)}"
}
},
{"type": "text", "text": prompt}
]
}
]
def process_single_item(client: OpenAI, data: Dict) -> Dict:
"""
Process a single data item through the VLLM API.
Args:
client: OpenAI client instance
data: Input data dictionary
Returns:
Updated data dictionary with model response
"""
# Extract image path and prompt
img_path = data['image_path']
prompt = data['question']
# Create chat messages
messages = create_chat_messages(img_path, prompt)
# Get model response
response = client.chat.completions.create(
model="tencent/HunyuanOCR",
messages=messages,
temperature=0.0,
top_p=0.95,
seed=1234,
stream=False,
extra_body={
"top_k": 1,
"repetition_penalty": 1.0
}
)
# Update data with model response
data["vllm_answer"] = response.choices[0].message.content
return data
def main():
"""Main function to process the JSONL file through VLLM API"""
# Initialize OpenAI client
client = OpenAI(
api_key="EMPTY",
base_url="http://localhost:8000/v1",
timeout=3600
)
# Define input/output paths
input_path = 'ominidoc_bench.jsonl'
output_path = "infer_result_ominidoc_bench.jsonl"
# Process data
with open(input_path, "r", encoding="utf-8") as fin, \
open(output_path, "w", encoding="utf-8") as fout:
# Iterate through input file
for line in tqdm(fin, desc="Processing documents"):
if not line.strip():
continue
try:
# Load and process data
data = json.loads(line)
processed_data = process_single_item(client, data)
# Write results
fout.write(json.dumps(processed_data, ensure_ascii=False) + "\n")
except Exception as e:
print(f"Error processing line: {str(e)}")
continue
print(f"Processing completed. Results saved to: {output_path}")
if __name__ == "__main__":
main()
import re
import os
import json
import random
from typing import Tuple, List, Dict
import numpy as np
from PIL import Image, ImageDraw, ImageFont
def clean_repeated_substrings(text):
"""Clean repeated substrings in text"""
n = len(text)
if n<8000:
return text
for length in range(2, n // 10 + 1):
candidate = text[-length:]
count = 0
i = n - length
while i >= 0 and text[i:i + length] == candidate:
count += 1
i -= length
if count >= 10:
return text[:n - length * (count - 1)]
return text
def norm_formula_HYOCR(gt_form, pre_form):
    """
    Normalization for OmniDocBench. Input: gt_form, pre_form (from quick_match_display_formula).

    End-to-end document parsing bypasses layout constraints and category definitions,
    so HunyuanOCR produces outputs at more flexible granularities. Certain mathematical
    expressions may be parsed as a mix of plain text and embedded LaTeX formulas, which
    can lead to mismatches under OmniDocBench's quick-match mechanism. We therefore
    manually correct incorrectly matched cases (i.e., those with an edit-distance score
    greater than 0.5). Below is a piece of logic used to handle some normalization cases.
    """
def clean_gt_tail(gt: str):
pattern = r'(\\quad+|\\qquad+)\s*\{?\(\s*\d+\s*\)\}?\s*$'
return re.sub(pattern, '', gt).rstrip()
pre_form = clean_gt_tail(pre_form)
gt_form = gt_form.replace('\[', '').replace('\]', '').replace(',', ',').strip()
pre_form = pre_form.replace('\[', '').replace('\]', '').replace(',', ',').strip()
pre_form = pre_form.replace("\\text{ⓐ}", "\\textcircled{a}") \
.replace("\\text{ⓑ}", "\\textcircled{b}") \
.replace("\\text{ⓒ}", "\\textcircled{c}") \
.replace("\\text{ⓓ}", "\\textcircled{d}")
return gt_form, pre_form
def parse_coords(coord_str: str) -> Tuple[float, float]:
"""Parse coordinate string and return (x,y) tuple"""
try:
x, y = coord_str.strip('()').split(',')
return (float(x), float(y))
except:
return (0, 0)
def denormalize_coordinates(coord: Tuple[float, float], image_width: int, image_height: int) -> Tuple[int, int]:
"""Denormalize coordinates from [0,1000] to image dimensions"""
x, y = coord
denorm_x = int(x * image_width / 1000)
denorm_y = int(y * image_height / 1000)
return (denorm_x, denorm_y)
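# Illustrative note (added comment): model coordinates are normalized to a
# 0-1000 grid, so on a 2000x1000 image the normalized point (500, 500) maps
# back to the pixel (1000, 500):
#   denormalize_coordinates((500, 500), 2000, 1000)  ->  (1000, 500)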
def process_spotting_response(response: str, image_width: int, image_height: int) -> str:
"""Process spotting task response and denormalize coordinates"""
try:
# Find all text and coordinate pairs using regex
pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
matches = re.finditer(pattern, response)
new_response = response
for match in matches:
text = match.group(1).strip()
coords = match.group(2)
# Parse the two coordinate points
coord_pattern = r'\((\d+),(\d+)\)'
coord_matches = re.findall(coord_pattern, coords)
if len(coord_matches) == 2:
start_coord = (float(coord_matches[0][0]), float(coord_matches[0][1]))
end_coord = (float(coord_matches[1][0]), float(coord_matches[1][1]))
# Denormalize coordinates
denorm_start = denormalize_coordinates(start_coord, image_width, image_height)
denorm_end = denormalize_coordinates(end_coord, image_width, image_height)
# Build new coordinate string
new_coords = f"({denorm_start[0]},{denorm_start[1]}),({denorm_end[0]},{denorm_end[1]})"
# Replace coordinates in original response
new_response = new_response.replace(coords, new_coords)
return new_response
except Exception as e:
print(f"Error processing response: {str(e)}")
return response
def draw_text_detection_boxes(image: Image, response: str) -> Image:
"""Draw text detection boxes on image"""
image_width, image_height = image.size
img_draw = image.copy()
draw = ImageDraw.Draw(img_draw)
# Create transparent overlay
overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
draw_overlay = ImageDraw.Draw(overlay)
    # PIL's built-in default font never fails to load, so no fallback is needed
    font = ImageFont.load_default()
# Extract text and coordinates using regex
pattern = r'([^()]+)(\(\d+,\d+\),\(\d+,\d+\))'
matches = re.finditer(pattern, response)
for match in matches:
try:
text = match.group(1).strip()
coords = match.group(2)
# Parse coordinates
coord_pattern = r'\((\d+),(\d+)\)'
coord_matches = re.findall(coord_pattern, coords)
if len(coord_matches) == 2:
x1, y1 = int(coord_matches[0][0]), int(coord_matches[0][1])
x2, y2 = int(coord_matches[1][0]), int(coord_matches[1][1])
# Generate random color
color = (np.random.randint(0, 200),
np.random.randint(0, 200),
np.random.randint(0, 255))
color_alpha = color + (20,)
# Draw rectangle and overlay
draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
draw_overlay.rectangle([x1, y1, x2, y2],
fill=color_alpha,
outline=(0, 0, 0, 0))
# Draw text label
text_x = x1
text_y = max(0, y1 - 15)
text_bbox = draw.textbbox((0, 0), text, font=font)
text_width = text_bbox[2] - text_bbox[0]
text_height = text_bbox[3] - text_bbox[1]
draw.rectangle([text_x, text_y,
text_x + text_width, text_y + text_height],
fill=(255, 255, 255, 30))
draw.text((text_x, text_y), text, font=font, fill=color)
except Exception as e:
print(f"Error drawing box: {str(e)}")
continue
# Combine image with overlay
img_draw.paste(overlay, (0, 0), overlay)
return img_draw
def main():
"""Main function to process images and visualize results"""
# Read JSONL file
jsonl_path = "/path/to/test_data.jsonl"
output_dir = "output_visualizations"
image_root = "/path/to/image_root"
os.makedirs(output_dir, exist_ok=True)
# Read all lines from JSONL
items = []
with open(jsonl_path, 'r') as f:
for line in f:
items.append(json.loads(line.strip()))
# Randomly select one item
item = random.choice(items)
# Get image path and response
image_path = os.path.join(image_root, item["image_name"])
response = clean_repeated_substrings(item["vllm-infer"])
print(f"Processing image: {item['image_name']}")
# Load and process image
image = Image.open(image_path)
image_width, image_height = image.size
# Process response coordinates
processed_response = process_spotting_response(response, image_width, image_height)
print("Original response:", response)
print("Processed response:", processed_response)
# Draw detection boxes
result_image = draw_text_detection_boxes(image, processed_response)
# Save result using original image name
output_path = os.path.join(output_dir, item["image_name"])
result_image.save(output_path)
print(f"Visualization saved to {output_path}")
if __name__ == "__main__":
main()
TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
Tencent HunyuanOCR Release Date: November 25, 2025
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. DEFINITIONS.
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanOCR released at [https://huggingface.co/tencent/HunyuanOCR].
k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
n. “including” shall mean including but not limited to.
2. GRANT OF RIGHTS.
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
3. DISTRIBUTION.
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
b. You must cause any modified files to carry prominent notices stating that You changed the files;
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
e. In the event that You use, integrate, implement, or otherwise deploy the Tencent Hunyuan Works, in whole or in part, to provide, enable, or support any service, product, or functionality to third parties, You shall clearly, accurately, and prominently disclose to all end users the full legal name and entity of the actual provider of such service, product, or functionality. You shall expressly and conspicuously state that Tencent is not affiliated with, associated with, sponsoring, or endorsing any such service, product, or functionality. You shall not use or display any name, logo, trademark, trade name, or other indicia of Tencent in any manner that could be construed as, or be likely to create, confusion, deception, or a false impression regarding any relationship, affiliation, sponsorship, or endorsement by Tencent.
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
4. ADDITIONAL COMMERCIAL TERMS.
If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
5. RULES OF USE.
a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
6. INTELLECTUAL PROPERTY.
a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
8. SURVIVAL AND TERMINATION.
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
9. GOVERNING LAW AND JURISDICTION.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
EXHIBIT A
ACCEPTABLE USE POLICY
Tencent reserves the right to update this Acceptable Use Policy from time to time.
Last modified: November 5, 2024
Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
1. Outside the Territory;
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
3. To harm Yourself or others;
4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
5. To override or circumvent the safety guardrails and safeguards We have put in place;
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
9. To intentionally defame, disparage or otherwise harass others;
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
11. To generate or disseminate personal identifiable information with the purpose of harming others;
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
13. To impersonate another individual without consent, authorization, or legal right;
14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
19. For military purposes;
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
# HunyuanOCR
## Paper
[HunyuanOCR Technical Report](https://arxiv.org/abs/2511.19575)
## Model Introduction
**HunyuanOCR** is an end-to-end OCR expert model built on Tencent Hunyuan's native multimodal architecture. With only **1B** lightweight parameters, it has already achieved multiple industry SOTA results. The model excels at **complex multilingual document parsing**, and also performs strongly in practical scenarios such as **text spotting, open-field information extraction, video subtitle extraction, and photo translation**.
✨ Key Features
- 💪 **Lightweight architecture**: Built on Hunyuan's native multimodal architecture and training strategy, this 1B-parameter OCR-specialized model greatly reduces deployment costs.
- 📑 **Full-scenario capabilities**: A single model covers classic OCR tasks such as text detection and recognition, complex document parsing, card/receipt field extraction, and subtitle extraction, and additionally supports end-to-end photo translation and document QA.
- 🚀 **Ease of use**: Deeply embraces the "end-to-end" philosophy of large models, reaching SOTA results with a single instruction and a single inference pass, more efficient and convenient than industry cascade solutions.
- 🌏 **Multilingual support**: Supports more than 100 languages and performs well in both single-language and mixed-language scenarios.
<div align="left">
<img src="./assets/hyocr-pipeline-v1.png">
</div>
## Dependencies
| Software | Version |
|:------------:| :------: |
| DTK | 25.04.2 |
| python | 3.10.12 |
| transformers | 4.57.1 |
| torch | 2.5.1+das.opt1.dtk25042 |
| accelerate | 1.11.0 |
| torchvision | 0.20.1+das.opt1.dtk25042 |
Recommended Docker image:
- Adjust the mount path (`-v`), `{docker_name}`, and `{docker_image_name}` below according to your actual setup
```bash
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_path/:/path/your_code_path/ -v /opt/hyhal/:/opt/hyhal/:ro {docker_image_name} bash
# Example:
docker run -it --shm-size 200g --network=host --name qwen3vl --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_path/:/path/your_code_path/ -v /opt/hyhal/:/opt/hyhal/:ro image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-py3.10 bash
# Install the PyAV backend dependency when running video inference
pip install git+https://github.com/huggingface/transformers@82a06db03535c49aa987719ed0746a76093b1ec4
```
More images are available from [光源](https://sourcefind.cn/#/service-list).
The specialized deep-learning libraries required for DCU GPUs in this project can be downloaded from the [光合](https://developer.sourcefind.cn/tool/) developer community; install the remaining packages according to requirements.txt:
```
pip install -r requirements.txt
```
## Dataset
`N/A`
## Training
`N/A`
## Inference
### transformers
#### Single-machine inference
```bash
HIP_VISIBLE_DEVICES=0 python Hunyuan-OCR-master/Hunyuan-OCR-hf/run_hy_ocr.py
```
## Results
**Input:**
- image:
<div align=center>
<img src="./assets/tools-dark.png"/>
</div>
- text: "提取文档图片中正文的所有信息用markdown格式表示,其中页眉、页脚部分忽略,表格用html格式表达,文档中公式用latex格式表示,按照阅读顺序组织进行解析。"
**Output:**
<div align=center>
<img src="./assets/result.png"/>
</div>
### Accuracy
`DCU results are consistent with GPU; inference framework: transformers.`
## Pretrained Weights
| Model | Weight Size | DCU Model | Minimum Cards | Download |
|:--------------------:|:----:|:----------:|:------:|:----------:|
| HunyuanOCR | 1B | BW1000| 1 | [Hugging Face](https://huggingface.co/tencent/HunyuanOCR) |
## Source Repository and Issue Feedback
- https://developer.sourcefind.cn/codes/modelzoo/hunyuanocr_pytorch
## References
- https://github.com/Tencent-Hunyuan/HunyuanOCR
<div align="center">
[中文阅读](./README_zh.md)
</div>
<div align="center">
# HunyuanOCR
</div>
<p align="center">
<img src="./assets/hyocr-head-img.png" width="80%"/> <br>
</p>
<p align="center">
<a href="https://huggingface.co/spaces/tencent/HunyuanOCR"><b>🎯 Demo</b></a> |
<a href="https://huggingface.co/tencent/HunyuanOCR"><b>📥 Model Download</b></a> |
<a href="https://arxiv.org/abs/2511.19575"><b>📄 Technical Report</b></a>
</p>
## 🤝 Join Our Community
<div align="center">
| Wechat Discussion Group | Discord Group |
| :---: | :---: |
| <img src="./assets/qrcode_for_hunyuanocr_wechat.jpg" width="150"> | [Join HunyuanOCR Discord](https://discord.gg/XeD3p2MRDk) |
</div>
## 🔥 News
- **[2025/11/28]** 🛠️ We fixed vLLM inference bugs and hyperparameter configuration issues such as the system prompt. We recommend using the latest vLLM installation steps and the [inference script](https://github.com/Tencent-Hunyuan/HunyuanOCR/blob/main/Hunyuan-OCR-master/Hunyuan-OCR-vllm/run_hy_ocr.py) for performance testing. There is currently still some accuracy difference between Transformers and the vLLM framework (we are working on fixing this).
- **[2025/11/25]** 📝 Inference code and model weights publicly available.
## 📖 Introduction
**HunyuanOCR** is a leading end-to-end OCR expert VLM powered by Hunyuan's native multimodal architecture. With a remarkably lightweight 1B-parameter design, it achieves state-of-the-art results on multiple industry benchmarks. The model demonstrates mastery in **complex multilingual document parsing** while excelling in practical applications including **text spotting, open-field information extraction, video subtitle extraction, and photo translation**.
## ✨ Key Features
- 💪 **Efficient Lightweight Architecture**: Built on Hunyuan's native multimodal architecture and training strategy, achieving SOTA performance with only 1B parameters, significantly reducing deployment costs.
- 📑 **Comprehensive OCR Capabilities**: A single model covering classic OCR tasks including text detection and recognition, complex document parsing, open-field information extraction and video subtitle extraction, while supporting end-to-end photo translation and document QA.
- 🚀 **Ultimate Usability**: Deeply embraces the "end-to-end" philosophy of large models - achieving SOTA results with single instruction and single inference, offering greater efficiency and convenience compared to industry cascade solutions.
- 🌏 **Extensive Language Support**: Robust support for over 100 languages, excelling in both single-language and mixed-language scenarios across various document types.
<div align="left">
<img src="./assets/hyocr-pipeline-v1.png" alt="HunyuanOCR framework" width="80%">
</div>
## 🛠️ Dependencies and Installation
### System Requirements
- 🖥️ Operating System: Linux
- 🐍 Python: 3.12+ (recommended and tested)
- ⚡ CUDA: 12.9
- 🔥 PyTorch: 2.7.1
- 🎮 GPU: NVIDIA GPU with CUDA support
- 🧠 GPU Memory: 20GB (for vLLM)
- 💾 Disk Space: 6GB
## 🚀 Quick Start with vLLM (⭐ Recommended)
- **[HunyuanOCR Usage Guide](https://docs.vllm.ai/projects/recipes/en/latest/Tencent-Hunyuan/HunyuanOCR.html)**
### Installation
```bash
pip install vllm>=0.12.0
pip install -r requirements.txt
```
Note: We suggest installing [cuda-compat-12-9](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/):
```bash
sudo dpkg -i cuda-compat-12-9_575.57.08-0ubuntu1_amd64.deb
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
# verify cuda-compat-12-9
ls /usr/local/cuda-12.9/compat
```
### Model Deploy
```bash
vllm serve tencent/HunyuanOCR \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--gpu-memory-utilization 0.2
```
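Once the server is up, requests can be sent through its OpenAI-compatible API. The snippet below is a minimal sketch based on the batch-inference client included in this repository; the image path is a placeholder, and it assumes the server launched above is listening on `http://localhost:8000/v1`.
```python
import base64
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Encode a local image as a base64 data URL (path is a placeholder)
with open("/path/to/image.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="tencent/HunyuanOCR",
    messages=[
        {"role": "system", "content": ""},
        {"role": "user", "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            {"type": "text", "text": "检测并识别图片中的文字,将文本坐标格式化输出。"},
        ]},
    ],
    temperature=0.0,
)
print(response.choices[0].message.content)
```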
### Model Inference
```python
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor
def clean_repeated_substrings(text):
"""Clean repeated substrings in text"""
n = len(text)
if n<8000:
return text
for length in range(2, n // 10 + 1):
candidate = text[-length:]
count = 0
i = n - length
while i >= 0 and text[i:i + length] == candidate:
count += 1
i -= length
if count >= 10:
return text[:n - length * (count - 1)]
return text
model_path = "tencent/HunyuanOCR"
llm = LLM(model=model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path)
sampling_params = SamplingParams(temperature=0, max_tokens=16384)
img_path = "/path/to/image.jpg"
img = Image.open(img_path)
messages = [
{"role": "system", "content": ""},
{"role": "user", "content": [
{"type": "image", "image": img_path},
{"type": "text", "text": "检测并识别图片中的文字,将文本坐标格式化输出。"}
]}
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = {"prompt": prompt, "multi_modal_data": {"image": [img]}}
output = llm.generate([inputs], sampling_params)[0]
print(clean_repeated_substrings(output.outputs[0].text))
```
### Alternatively, you can use the provided demo script as follows:
```shell
cd Hunyuan-OCR-master/Hunyuan-OCR-vllm && python run_hy_ocr.py
```
## 🚀 Quick Start with Transformers
### Installation
```bash
pip install git+https://github.com/huggingface/transformers@82a06db03535c49aa987719ed0746a76093b1ec4
```
> **Note**: Inference with Transformers currently shows some performance degradation compared to the vLLM framework (we are working hard to fix it); the fixed version will be merged into the Transformers main branch later.
### Model Inference
```python
from transformers import AutoProcessor
from transformers import HunYuanVLForConditionalGeneration
from PIL import Image
import torch
model_name_or_path = "tencent/HunyuanOCR"
processor = AutoProcessor.from_pretrained(model_name_or_path, use_fast=False)
img_path = "path/to/your/image.jpg"
image_inputs = Image.open(img_path)
messages1 = [
{"role": "system", "content": ""},
{
"role": "user",
"content": [
{"type": "image", "image": img_path},
{"type": "text", "text": (
"检测并识别图片中的文字,将文本坐标格式化输出。"
)},
],
}
]
messages = [messages1]
texts = [
processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
for msg in messages
]
inputs = processor(
text=texts,
images=image_inputs,
padding=True,
return_tensors="pt",
)
model = HunYuanVLForConditionalGeneration.from_pretrained(
model_name_or_path,
attn_implementation="eager",
dtype=torch.bfloat16,
device_map="auto"
)
with torch.no_grad():
device = next(model.parameters()).device
inputs = inputs.to(device)
generated_ids = model.generate(**inputs, max_new_tokens=16384, do_sample=False)
if "input_ids" in inputs:
input_ids = inputs.input_ids
else:
print("inputs: # fallback", inputs)
input_ids = inputs.inputs
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
]
output_texts = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_texts)
```
### Alternatively, you can use the provided demo script as follows:
```shell
cd Hunyuan-OCR-master/Hunyuan-OCR-hf && python run_hy_ocr.py
```
## 💬 Application-oriented Prompts
| Task | English | Chinese |
|------|---------|---------|
| **Spotting** | Detect and recognize text in the image, and output the text coordinates in a formatted manner. | 检测并识别图片中的文字,将文本坐标格式化输出。 |
| **Document Parsing** | • Identify the formula in the image and represent it using LaTeX format.<br><br>• Parse the table in the image into HTML.<br><br>• Parse the chart in the image; use Mermaid format for flowcharts and Markdown for other charts.<br><br>• Extract all information from the main body of the document image and represent it in markdown format, ignoring headers and footers. Tables should be expressed in HTML format, formulas in the document should be represented using LaTeX format, and the parsing should be organized according to the reading order. | • 识别图片中的公式,用 LaTeX 格式表示。<br><br>• 把图中的表格解析为 HTML。<br><br>• 解析图中的图表,对于流程图使用 Mermaid 格式表示,其他图表使用 Markdown 格式表示。<br><br>• 提取文档图片中正文的所有信息用 markdown 格式表示,其中页眉、页脚部分忽略,表格用 html 格式表达,文档中公式用 latex 格式表示,按照阅读顺序组织进行解析。|
| **General Parsing** | • Extract the text in the image. | • 提取图中的文字。|
| **Information Extraction** | • Output the value of Key.<br><br>• Extract the content of the fields: ['key1','key2', ...] from the image and return it in JSON format.<br><br>• Extract the subtitles from the image. | • 输出 Key 的值。<br><br>• 提取图片中的: ['key1','key2', ...] 的字段内容,并按照 JSON 格式返回。<br><br>• 提取图片中的字幕。 |
| **Translation** | First extract the text, then translate the text content into English. If it is a document, ignore the header and footer. Formulas should be represented in LaTeX format, and tables should be represented in HTML format. | 先提取文字,再将文字内容翻译为英文。若是文档,则其中页眉、页脚忽略。公式用latex格式表示,表格用html格式表示。 |
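Any of these prompts can be dropped into the message formats shown in the quick-start sections above. As a hedged illustration, the sketch below runs the information-extraction prompt through the offline vLLM API from the earlier example; the image path and the field names are placeholders.
```python
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor

model_path = "tencent/HunyuanOCR"
llm = LLM(model=model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path)
sampling_params = SamplingParams(temperature=0, max_tokens=16384)

img_path = "/path/to/receipt.jpg"  # placeholder image
# Information-extraction prompt from the table above; field names are placeholders
prompt_text = "提取图片中的: ['name', 'date', 'total'] 的字段内容,并按照 JSON 格式返回。"

messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": [
        {"type": "image", "image": img_path},
        {"type": "text", "text": prompt_text},
    ]},
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = {"prompt": prompt, "multi_modal_data": {"image": [Image.open(img_path)]}}
print(llm.generate([inputs], sampling_params)[0].outputs[0].text)
```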
## 📊 Evaluation
> **Note**: Evaluation metrics for competing methods are taken from official reports when available; otherwise, they are reproduced using competitor models or interfaces with the recommended standard instructions.
> **Note**: The HunyuanOCR evaluation metrics are obtained with the TensorRT framework, so they may differ slightly from results produced with Transformers or vLLM.
### Text Spotting Performance on In-house Benchmark
| Model Type | Methods | Overall | Art | Doc | Game | Hand | Ads | Receipt | Screen | Scene | Video |
|------------|---------|---------|-----|-----|------|------|-----|----------|---------|--------|--------|
| **Traditional methods** | PaddleOCR | 53.38 | 32.83 | 70.23 | 51.59 | 56.39 | 57.38 | 50.59 | 63.38 | 44.68 | 53.35 |
| **Traditional methods** | BaiduOCR | 61.9 | 38.5 | **78.95** | 59.24 | 59.06 | 66.7 | **63.66** | 68.18 | 55.53 | 67.38 |
| **General VLM** | Qwen3VL-2B-Instruct | 29.68 | 29.43 | 19.37 | 20.85 | 50.57 | 35.14 | 24.42 | 12.13 | 34.90 | 40.1 |
| **General VLM** | Qwen3VL-235B-Instruct | 53.62 | 46.15 | 43.78 | 48.00 | 68.90 | 64.01 | 47.53 | 45.91 | 54.56 | 63.79 |
| **General VLM** | Seed-1.6-Vision | 59.23 | 45.36 | 55.04 | 59.68 | 67.46 | 65.99 | 55.68 | 59.85 | 53.66 | 70.33 |
| **OCR-Specific VLM** | HunyuanOCR | **70.92** | **56.76** | 73.63 | **73.54** | **77.10** | **75.34** | 63.51 | **76.58** | **64.56** | **77.31** |
> **Summary**: HunyuanOCR achieves the best overall performance (70.92%) across different scenarios, significantly outperforming both traditional OCR methods and general VLMs.
### Document Parsing Performance on OmniDocBench and Multilingual In-house Benchmark (Edit Distance)
| Model Type | Method | Size | OmniDocBench Overall | OmniDocBench Text | OmniDocBench Formula | OmniDocBench Table | Wild-OmniDocBench Overall | Wild-OmniDocBench Text | Wild-OmniDocBench Formula | Wild-OmniDocBench Table | DocML |
|:-----------|:-------|:-----|:---------|:---------|:----------|:--------|:----------|:---------|:----------|:---------|:--------|
| **General VLMs** | Gemini-2.5-pro | - | 88.03 | 0.075 | 85.92 | 85.71 | 80.59 | 0.118 | 75.03 | 78.56 | 82.64 |
| **General VLMs** | Qwen3-VL-235B | 235B | 89.15 | 0.069 | 88.14 | 86.21 | 79.69 | 0.09 | 80.67 | 68.31 | 81.40 |
| **Specialized VLMs (Modular)** | MonkeyOCR-pro-3B | 3B | 88.85 | 0.075 | 87.5 | 86.78 | 70.00 | 0.211 | 63.27 | 67.83 | 56.50 |
| **Specialized VLMs (Modular)** | MinerU2.5 | 1.2B | 90.67 | 0.047 | 88.46 | 88.22 | 70.91 | 0.218 | 64.37 | 70.15 | 52.05 |
| **Specialized VLMs (Modular)** | PaddleOCR-VL | 0.9B | 92.86 | 0.035 | 91.22 | 90.89 | 72.19 | 0.232 | 65.54 | 74.24 | 57.42 |
| **Specialized VLMs (End2End)** | Mistral-OCR | - | 78.83 | 0.164 | 82.84 | 70.03 | - | - | - | - | 64.71 |
| **Specialized VLMs (End2End)** | Deepseek-OCR | 3B | 87.01 | 0.073 | 83.37 | 84.97 | 74.23 | 0.178 | 70.07 | 70.41 | 57.22 |
| **Specialized VLMs (End2End)** | dots.ocr | 3B | 88.41 | 0.048 | 83.22 | 86.78 | 78.01 | 0.121 | 74.23 | 71.89 | 77.50 |
| **Specialized VLMs (End2End)** | **HunyuanOCR** | 1B | **94.10** | 0.042 | **94.73** | **91.81** | **85.21** | **0.081** | **82.09** | **81.64** | **91.03** |
> **Summary**: HunyuanOCR demonstrates superior performance in multilingual document parsing, achieving the best results across most categories.
### Information Extraction (in-house Benchmark) and VQA Performance (OCRBench)
| Model | Cards | Receipts | Video Subtitles | OCRBench |
|:------|:------|:---------|:----------------|:----------|
| DeepSeek-OCR | 10.04 | 40.54 | 5.41 | 430 |
| PP-ChatOCR | 57.02 | 50.26 | 3.1 | - |
| Qwen3-VL-2B-Instruct | 67.62 | 64.62 | 3.75 | 858 |
| Seed-1.6-Vision | 70.12 | 67.5 | 60.45 | 881 |
| Qwen3-VL-235B-A22B-Instruct | 75.59 | 78.4 | 50.74 | **920** |
| Gemini-2.5-Pro | 80.59 | 80.66 | 53.65 | 872 |
| **HunyuanOCR** | **92.29** | **92.53** | **92.87** | 860 |
> **Summary**: HunyuanOCR significantly outperforms larger models in cards/receipts processing and video subtitle extraction, while maintaining competitive performance on OCRBench.
### Text Image Translation (in-house Benchmark) Performance
| Method | Size | Other2En | Other2Zh | DoTA (en2zh) |
|--------|------|-----------|-----------|--------------|
| Gemini-2.5-Flash | - | 79.26 | 80.06 | 85.60 |
| Qwen3-VL-235B-Instruct | 235B | 73.67 | 77.20 | 80.01 |
| Qwen3-VL-8B-Instruct | 8B | 75.09 | 75.63 | 79.86 |
| Qwen3-VL-4B-Instruct | 4B | 70.38 | 70.29 | 78.45 |
| Qwen3-VL-2B-Instruct | 2B | 66.30 | 66.77 | 73.49 |
| PP-DocTranslation | - | 52.63 | 52.43 | 82.09 |
| **HunyuanOCR** | **1B** | 73.38 | 73.62 | 83.48 |
> **Summary**: Using only 1B parameters, HunyuanOCR achieves results comparable to Qwen3-VL-235B on photo translation tasks.
## 💡 Visualizations
<details>
<summary><u style="color: #2E64FE;">Click here to view detailed results.</u></summary>
### Text Spotting
Our model outputs the text content and corresponding coordinates of every piece of text in an image at the line level. It performs exceptionally well in scenarios such as documents, artistic fonts, street views, handwriting, advertisements, invoices, screenshots, games, and videos.
<p align="left">
<img src="./assets/spotting1_cropped.png" width="40%"/> <br>
<img src="./assets/vis_document_23.jpg" width="40%"/> <br>
</p>
### Complex Document Processing
The model digitizes scanned or photographed multilingual documents: it organizes the text content according to reading order, represents formulas in LaTeX, and expresses complex tables in HTML.
<p align="left">
<img src="./assets/vis_parsing_fig.png" width="40%"/> <br>
<img src="./assets/show_res_parsing_fig.png" width="40%"/> <br>
<img src="./assets/vis_parsing_table.png" width="40%"/> <br>
<img src="./assets/vis_parsing_table_2.png" width="40%"/> <br>
<img src="./assets/parsing_rgsj.png" width="40%"/> <br>
<img src="./assets/parsing_rgsjz_2.png" width="40%"/> <br>
<img src="./assets/qikai1.png" width="40%"/> <br>
<img src="./assets/guwan1.png" width="40%"/> <br>
<img src="./assets/parsing_chart1.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart1.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart2.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart3.png" width="40%"/> <br>
</p>
### Open-field Information Extraction
For common cards and receipts, fields of interest (such as name, address, or company) are extracted and returned in standard JSON format.
<p align="left">
<img src="./assets/vis_ie_1.png" width="40%"/> <br>
</p>
<p align="left">
<img src="./assets/ie_parallel.jpg" width="25%"/> <br>
</p>
**Prompt:**
Extract the content of the fields: ['单价', '上车时间', '发票号码', '省前缀', '总金额', '发票代码', '下车时间', '里程数'] from the image and return it in JSON format.
**Response:**
```json
{
"单价": "3.00",
"上车时间": "09:01",
"发票号码": "42609332",
"省前缀": "陕",
"总金额": "¥77.10元",
"发票代码": "161002018100",
"下车时间": "09:51",
"里程数": "26.1km"
}
```
### Video Subtitle Extraction
Our model is capable of automatically extracting subtitles from videos, including bilingual ones.
<p align="left">
<img src="./assets/vis_subtitle1.png" width="40%"/> <br>
<img src="./assets/vis_subtitle2.png" width="40%"/> <br>
<img src="./assets/vis_subtitle3.png" width="37.5%"/> <br>
</p>
### Image Text Translation
Our model translates images containing text in other languages into Chinese or English end-to-end. It currently supports 14 frequently used languages (German, Spanish, Turkish, Italian, Russian, French, Portuguese, Arabic, Thai, Vietnamese, Indonesian, Malay, Japanese, and Korean) translated into Chinese/English, as well as Chinese-English translation; it won the small-model track championship in the ICDAR 2025 end-to-end document translation competition.
<p align="left">
<img src="./assets/translation2.png" width="40%"/> <br>
</p>
</details>
## 📚 Citation
```
@misc{hunyuanvisionteam2025hunyuanocrtechnicalreport,
title={HunyuanOCR Technical Report},
author={Hunyuan Vision Team and Pengyuan Lyu and Xingyu Wan and Gengluo Li and Shangpin Peng and Weinong Wang and Liang Wu and Huawen Shen and Yu Zhou and Canhui Tang and Qi Yang and Qiming Peng and Bin Luo and Hower Yang and Xinsong Zhang and Jinnian Zhang and Houwen Peng and Hongming Yang and Senhao Xie and Longsha Zhou and Ge Pei and Binghong Wu and Kan Wu and Jieneng Yang and Bochao Wang and Kai Liu and Jianchen Zhu and Jie Jiang and Linus and Han Hu and Chengquan Zhang},
year={2025},
journal={arXiv preprint arXiv:2511.19575},
url={https://arxiv.org/abs/2511.19575},
}
```
## 🙏 Acknowledgements
We would like to thank [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [MinerU](https://github.com/opendatalab/MinerU), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR), [dots.ocr](https://github.com/rednote-hilab/dots.ocr) for their valuable models and ideas.
We also appreciate the benchmarks: [OmniDocBench](https://github.com/opendatalab/OmniDocBench), [OCRBench](https://github.com/Yuliang-Liu/MultimodalOCR/tree/main/OCRBench), [DoTA](https://github.com/liangyupu/DIMTDA).
Special thanks to the vLLM and Hugging Face communities for their Day-0 inference support.
<div align="center">
[English Version](./README.md)
</div>
<div align="center">
# 混元OCR:混元原生多模态端到端 OCR 专家,1B 轻量化参数却斩获多项业界 SOTA!
</div>
<p align="center">
<img src="./assets/hyocr-head-img.png" width="80%"/> <br>
</p>
<p align="center">
<a href="https://huggingface.co/spaces/tencent/HunyuanOCR"><b>🎯 在线Demo体验</b></a> |
<a href="https://huggingface.co/tencent/HunyuanOCR"><b>📥 模型权重下载</b></a> |
<a href="https://arxiv.org/abs/2511.19575"><b>📄 技术报告</b></a>
</p>
## 🤝 加入社群交流
<div align="center">
| 企业微信交流群 | Discord群组 |
| :---: | :---: |
| <img src="./assets/qrcode_for_hunyuanocr_wechat.jpg" width="150"> | [Join HunyuanOCR Discord](https://discord.gg/XeD3p2MRDk) |
</div>
## 🔥 最新动态
- **[2025/11/28]** 🛠️ 我们修复了vLLM推理bug以及system prompt等超参配置问题。建议使用最新的vLLM安装步骤和[推理脚本](https://github.com/Tencent-Hunyuan/HunyuanOCR/blob/main/Hunyuan-OCR-master/Hunyuan-OCR-vllm/run_hy_ocr.py)进行效果测试。目前Transformers相比vLLM框架仍然存在一定的精度差异(正在努力修复中)。
- **[2025/11/25]** 📝 推理代码和模型权重已开源。
## 📖 简介
**HunyuanOCR**是一款基于腾讯混元原生多模态架构的端到端OCR专家模型。仅以**1B**轻量化参数,便已斩获多项业界SOTA成绩。该模型精通**复杂多语种文档解析**,同时在**文字检测识别、开放字段信息抽取、视频字幕识别、拍照翻译**等全场景实用技能中表现出色。
## ✨ 核心特点
- 💪 **轻量化架构**:基于混元原生多模态架构与训练策略,打造仅1B参数的OCR专项模型,大幅降低部署成本。
- 📑 **全场景功能**:单一模型覆盖文字检测和识别、复杂文档解析、卡证票据字段抽取、字幕提取等OCR经典任务,更支持端到端拍照翻译与文档问答。
- 🚀 **极致易用**:深度贯彻大模型"端到端"理念,单一指令、单次推理直达SOTA结果,较业界级联方案更高效便捷。
- 🌏 **多语种支持**:支持超过100种语言,在单语种和混合语言场景下均表现出色。
<div align="left">
<img src="./assets/hyocr-pipeline-v1.png" alt="HunyuanOCR框架" width="80%">
</div>
## 🛠️ 环境依赖与安装
### 系统要求
- 🖥️ 操作系统:Linux
- 🐍 Python版本:3.12+(推荐)
- ⚡ CUDA版本:12.9
- 🔥 PyTorch版本:2.7.1
- 🎮 GPU:支持CUDA的NVIDIA显卡
- 🧠 GPU显存:20GB (for vLLM)
- 💾 磁盘空间:6GB
## 🚀 基于vLLM快速使用 (⭐ 推荐)
- **[HunyuanOCR vLLM使用指南](https://docs.vllm.ai/projects/recipes/en/latest/Tencent-Hunyuan/HunyuanOCR.html)**
### 安装步骤
```bash
pip install vllm>=0.12.0
pip install -r requirements.txt
```
Note: 建议安装[cuda-compat-12-9](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/):
```bash
sudo dpkg -i cuda-compat-12-9_575.57.08-0ubuntu1_amd64.deb
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
# verify cuda-compat-12-9
ls /usr/local/cuda-12.9/compat
```
### 模型部署
```bash
vllm serve tencent/HunyuanOCR \
--no-enable-prefix-caching \
--mm-processor-cache-gb 0 \
--gpu-memory-utilization 0.2
```
### Model Inference
```python
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor
def clean_repeated_substrings(text):
"""Clean repeated substrings in text"""
n = len(text)
if n<8000:
return text
for length in range(2, n // 10 + 1):
candidate = text[-length:]
count = 0
i = n - length
while i >= 0 and text[i:i + length] == candidate:
count += 1
i -= length
if count >= 10:
return text[:n - length * (count - 1)]
return text
# Initialize the offline vLLM engine and the matching chat-template processor.
model_path = "tencent/HunyuanOCR"
llm = LLM(model=model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path)
# Greedy decoding (temperature=0) with a generous max_tokens budget.
sampling_params = SamplingParams(temperature=0, max_tokens=16384)
img_path = "/path/to/image.jpg"
img = Image.open(img_path)
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": [
        {"type": "image", "image": img_path},
        {"type": "text", "text": "检测并识别图片中的文字,将文本坐标格式化输出。"}
    ]}
]
# Render the chat template to a prompt string and pass the PIL image alongside it.
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = {"prompt": prompt, "multi_modal_data": {"image": [img]}}
output = llm.generate([inputs], sampling_params)[0]
print(clean_repeated_substrings(output.outputs[0].text))
```
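Since `llm.generate` accepts a list of requests, several images can be processed in one batched call. A minimal sketch reusing the `llm`, `processor`, `sampling_params`, and `clean_repeated_substrings` objects defined above (file paths are placeholders):
```python
# Build one request per image and run a single batched generate call.
img_paths = ["/path/to/page1.jpg", "/path/to/page2.jpg"]  # placeholder paths
batch = []
for p in img_paths:
    msgs = [
        {"role": "system", "content": ""},
        {"role": "user", "content": [
            {"type": "image", "image": p},
            {"type": "text", "text": "检测并识别图片中的文字,将文本坐标格式化输出。"},
        ]},
    ]
    prompt = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    batch.append({"prompt": prompt, "multi_modal_data": {"image": [Image.open(p)]}})

for out in llm.generate(batch, sampling_params):
    print(clean_repeated_substrings(out.outputs[0].text))
```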
### Alternatively, you can run the provided inference script directly:
```shell
cd Hunyuan-OCR-master/Hunyuan-OCR-vllm && python run_hy_ocr.py
```
## 🚀 Quick Start with Transformers
### Installation
```bash
pip install git+https://github.com/huggingface/transformers@82a06db03535c49aa987719ed0746a76093b1ec4
```
> **Note**: Transformers inference currently shows some performance degradation relative to the vLLM framework (a fix is in progress); the fixed version will later be merged into the Transformers main branch.
### Model Inference
HunyuanOCR provides an intuitive model inference interface. Usage is shown below:
```python
from transformers import AutoProcessor
from transformers import HunYuanVLForConditionalGeneration
from PIL import Image
import torch
model_name_or_path = "tencent/HunyuanOCR"
processor = AutoProcessor.from_pretrained(model_name_or_path, use_fast=False)
img_path = "path/to/your/image.jpg"
image_inputs = Image.open(img_path)
messages1 = [
{"role": "system", "content": ""},
{
"role": "user",
"content": [
{"type": "image", "image": img_path},
{"type": "text", "text": (
"检测并识别图片中的文字,将文本坐标格式化输出。"
)},
],
}
]
messages = [messages1]
texts = [
processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
for msg in messages
]
inputs = processor(
text=texts,
images=image_inputs,
padding=True,
return_tensors="pt",
)
model = HunYuanVLForConditionalGeneration.from_pretrained(
model_name_or_path,
attn_implementation="eager",
dtype=torch.bfloat16,
device_map="auto"
)
with torch.no_grad():
    # Move the inputs onto the same device as the model before generation.
    device = next(model.parameters()).device
    inputs = inputs.to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=16384, do_sample=False)
    # Strip the prompt tokens so that only the newly generated tokens are decoded.
    input_ids = inputs.input_ids
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_texts)
```
#### Alternatively, you can run the provided inference script directly:
```shell
cd Hunyuan-OCR-master/Hunyuan-OCR-hf && python run_hy_ocr.py
```
## 💬 Recommended Prompts for OCR Tasks
Any prompt below can be placed directly in the `text` field of the user message in the inference snippets above; see the sketch after the table.
| Task | Chinese Prompt | English Prompt |
|------|---------|---------|
| **Text Spotting** | 检测并识别图片中的文字,将文本坐标格式化输出。 | Detect and recognize text in the image, and output the text coordinates in a formatted manner. |
| **Document Parsing** | • 识别图片中的公式,用 LaTeX 格式表示。<br><br>• 把图中的表格解析为 HTML。<br><br>• 解析图中的图表,对于流程图使用 Mermaid 格式表示,其他图表使用 Markdown 格式表示。<br><br>• 提取文档图片中正文的所有信息用 markdown 格式表示,其中页眉、页脚部分忽略,表格用 html 格式表达,文档中公式用 latex 格式表示,按照阅读顺序组织进行解析。| • Identify the formula in the image and represent it using LaTeX format.<br><br>• Parse the table in the image into HTML.<br><br>• Parse the chart in the image; use Mermaid format for flowcharts and Markdown for other charts.<br><br>• Extract all information from the main body of the document image and represent it in markdown format, ignoring headers and footers. Tables should be expressed in HTML format, formulas in the document should be represented using LaTeX format, and the parsing should be organized according to the reading order.|
| **General Text Extraction** | • 提取图中的文字。 | • Extract the text in the image. |
| **Information Extraction** | • 输出 Key 的值。<br><br>• 提取图片中的: ['key1','key2', ...] 的字段内容,并按照 JSON 格式返回。<br><br>• 提取图片中的字幕。 | • Output the value of Key.<br><br>• Extract the content of the fields: ['key1','key2', ...] from the image and return it in JSON format.<br><br>• Extract the subtitles from the image. |
| **Translation** | 先提取文字,再将文字内容翻译为英文。若是文档,则其中页眉、页脚忽略。公式用latex格式表示,表格用html格式表示。 | First extract the text, then translate the text content into English. If it is a document, ignore the header and footer. Formulas should be represented in LaTeX format, and tables should be represented in HTML format. |
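As a convenience, the recommended prompts can be kept in a small lookup table and dropped into the `text` field of the user message. A minimal sketch; the task labels are just illustrative keys, and the prompt strings are copied from the table above:
```python
# Illustrative lookup from task label to recommended prompt (labels are hypothetical).
TASK_PROMPTS = {
    "spotting": "检测并识别图片中的文字,将文本坐标格式化输出。",
    "table_to_html": "把图中的表格解析为 HTML。",
    "text_extraction": "提取图中的文字。",
}

def build_user_content(img_path: str, task: str) -> list:
    """Build the user-message content list for the chosen task prompt."""
    return [
        {"type": "image", "image": img_path},
        {"type": "text", "text": TASK_PROMPTS[task]},
    ]
```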
## 📊 Evaluation Results
> **Note**: Competitor results are taken from their official reports where available, or otherwise reproduced with the competitor models/APIs and the recommended standard prompts.
> **Note**: HunyuanOCR results were obtained with TensorRT-based inference and may differ slightly from Transformers/vLLM inference.
### Text Spotting Results on an In-House Benchmark
| Model Type | Methods | Overall | Art | Doc | Game | Hand | Ads | Receipt | Screen | Scene | Video |
|------------|---------|---------|-----|-----|------|------|-----|----------|---------|--------|--------|
| **Traditional methods** | PaddleOCR | 53.38 | 32.83 | 70.23 | 51.59 | 56.39 | 57.38 | 50.59 | 63.38 | 44.68 | 53.35 |
| **Traditional methods** | BaiduOCR | 61.9 | 38.5 | **78.95** | 59.24 | 59.06 | 66.7 | **63.66** | 68.18 | 55.53 | 67.38 |
| **General VLM** | Qwen3VL-2B-Instruct | 29.68 | 29.43 | 19.37 | 20.85 | 50.57 | 35.14 | 24.42 | 12.13 | 34.90 | 40.1 |
| **General VLM** | Qwen3VL-235B-Instruct | 53.62 | 46.15 | 43.78 | 48.00 | 68.90 | 64.01 | 47.53 | 45.91 | 54.56 | 63.79 |
| **General VLM** | Seed-1.6-Vision | 59.23 | 45.36 | 55.04 | 59.68 | 67.46 | 65.99 | 55.68 | 59.85 | 53.66 | 70.33 |
| **OCR-Specific VLM** | HunyuanOCR | **70.92** | **56.76** | 73.63 | **73.54** | **77.10** | **75.34** | 63.51 | **76.58** | **64.56** | **77.31** |
> **Summary**: HunyuanOCR achieves the best overall performance (70.92%) across all scenarios, clearly outperforming traditional OCR methods and general VLMs.
### Document Parsing on OmniDocBench and an In-House Multilingual Benchmark (Evaluated with Edit Distance)
| Model Type | Method | Size | OmniDocBench | | | | Wild-OmniDocBench | | | | DocML |
|:-----------|:-------|:-----|:---------|:---------|:----------|:--------|:----------|:---------|:----------|:---------|:--------|
| | | | overall | text | formula | table | overall | text | formula | table | |
| **General VLMs** | Gemini-2.5-Pro | - | 88.03 | 0.075 | 85.92 | 85.71 | 80.59 | 0.118 | 75.03 | 78.56 | 82.64 |
| **General VLMs** | Qwen3-VL-235B | 235B | 89.15 | 0.069 | 88.14 | 86.21 | 79.69 | 0.09 | 80.67 | 68.31 | 81.40 |
| **Specialized VLMs (Modular)** | MonkeyOCR-pro-3B | 3B | 88.85 | 0.075 | 87.5 | 86.78 | 70.00 | 0.211 | 63.27 | 67.83 | 56.50 |
| **Specialized VLMs (Modular)** | MinerU2.5 | 1.2B | 90.67 | 0.047 | 88.46 | 88.22 | 70.91 | 0.218 | 64.37 | 70.15 | 52.05 |
| **Specialized VLMs (Modular)** | PaddleOCR-VL | 0.9B | 92.86 | 0.035 | 91.22 | 90.89 | 72.19 | 0.232 | 65.54 | 74.24 | 57.42 |
| **Specialized VLMs (End2End)** | Mistral-OCR | - | 78.83 | 0.164 | 82.84 | 70.03 | - | - | - | - | 64.71 |
| **Specialized VLMs (End2End)** | Deepseek-OCR | 3B | 87.01 | 0.073 | 83.37 | 84.97 | 74.23 | 0.178 | 70.07 | 70.41 | 57.22 |
| **Specialized VLMs (End2End)** | dots.ocr | 3B | 88.41 | 0.048 | 83.22 | 86.78 | 78.01 | 0.121 | 74.23 | 71.89 | 77.50 |
| **Specialized VLMs (End2End)** | **HunyuanOCR** | 1B | **94.10** | 0.042 | **94.73** | **91.81** | **85.21** | **0.081** | **82.09** | **81.64** | **91.03** |
> **Summary**: HunyuanOCR shows excellent multilingual document parsing performance, achieving the lowest edit distance in most categories.
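For context, document-parsing quality of this kind is usually scored with a normalized edit distance between the predicted and reference text. The following is a generic sketch of such a metric, not the official OmniDocBench evaluation code:
```python
def normalized_edit_distance(pred: str, gt: str) -> float:
    """Levenshtein distance divided by the longer string's length (0.0 means an exact match)."""
    m, n = len(pred), len(gt)
    if max(m, n) == 0:
        return 0.0
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == gt[j - 1] else 1
            curr[j] = min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost)
        prev = curr
    return prev[n] / max(m, n)
```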
### Information Extraction (In-House Benchmark) and OCRBench Results
| Model | Cards | Receipts | Video Subtitles | OCRBench |
|:------|:------|:---------|:----------------|:----------|
| DeepSeek-OCR | 10.04 | 40.54 | 5.41 | 430 |
| PP-ChatOCR | 57.02 | 50.26 | 3.1 | - |
| Qwen3-VL-2B-Instruct | 67.62 | 64.62 | 3.75 | 858 |
| Seed-1.6-Vision | 70.12 | 67.5 | 60.45 | 881 |
| Qwen3-VL-235B-A22B-Instruct | 75.59 | 78.4 | 50.74 | **920** |
| Gemini-2.5-Pro | 80.59 | 80.66 | 53.65 | 872 |
| **HunyuanOCR** | **92.29** | **92.53** | **92.87** | 860 |
> **Summary**: HunyuanOCR significantly outperforms general VLMs on card/receipt information extraction and video subtitle extraction, while also reaching SOTA on OCRBench among models of comparable size.
### Photo Translation Results on an In-House Benchmark
| Method | Size | Other2En | Other2Zh | DoTA (en2zh) |
|--------|------|-----------|-----------|--------------|
| Gemini-2.5-Flash | - | 79.26 | 80.06 | 85.60 |
| Qwen3-VL-235B-Instruct | 235B | 73.67 | 77.20 | 80.01 |
| Qwen3-VL-8B-Instruct | 8B | 75.09 | 75.63 | 79.86 |
| Qwen3-VL-4B-Instruct | 4B | 70.38 | 70.29 | 78.45 |
| Qwen3-VL-2B-Instruct | 2B | 66.30 | 66.77 | 73.49 |
| PP-DocTranslation | - | 52.63 | 52.43 | 82.09 |
| **HunyuanOCR** | **1B** | 73.38 | 73.62 | 83.48 |
> **Summary**: With only 1B parameters, HunyuanOCR achieves photo-translation results comparable to Qwen3-VL-235B.
## 💡 Visualization
<details>
<summary><u style="color: #2E64FE;">Click to expand</u></summary>
### Text Spotting
Outputs the text content and corresponding coordinates of all text in an image at line-level granularity. Our model performs excellently on documents, artistic text, street scenes, handwriting, advertisements, receipts, screenshots, games, video, and other scenarios.
<p align="left">
<img src="./assets/spotting1_cropped.png" width="40%"/> <br>
<img src="./assets/vis_document_23.jpg" width="40%"/> <br>
</p>
### Complex Document Parsing
Digitizes scanned or photographed multilingual documents: the text content in the image is organized in reading order, formulas are expressed in LaTeX, and complex tables are expressed in HTML.
<p align="left">
<img src="./assets/vis_parsing_fig.png" width="40%"/> <br>
<img src="./assets/show_res_parsing_fig.png" width="40%"/> <br>
<img src="./assets/vis_parsing_table.png" width="40%"/> <br>
<img src="./assets/vis_parsing_table_2.png" width="40%"/> <br>
<img src="./assets/parsing_rgsj.png" width="40%"/> <br>
<img src="./assets/parsing_rgsjz_2.png" width="40%"/> <br>
<img src="./assets/qikai1.png" width="40%"/> <br>
<img src="./assets/guwan1.png" width="40%"/> <br>
<img src="./assets/parsing_chart1.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart1.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart2.png" width="40%"/> <br>
<img src="./assets/vis_parsing_chart3.png" width="40%"/> <br>
</p>
### Open-Field Information Extraction
Parses fields of interest (such as name, address, or organization) on common cards and receipts into standard JSON format.
<p align="left">
<img src="./assets/vis_ie_1.png" width="40%"/> <br>
</p>
<p align="left">
<img src="./assets/ie_parallel.jpg" width="25%"/> <br>
</p>
**Prompt:**
提取图片中的:['单价', '上车时间', '发票号码', '省前缀', '总金额', '发票代码', '下车时间', '里程数']的字段内容,并且按照JSON格式返回。
**Response:**
```json
{
"单价": "3.00",
"上车时间": "09:01",
"发票号码": "42609332",
"省前缀": "陕",
"总金额": "¥77.10元",
"发票代码": "161002018100",
"下车时间": "09:51",
"里程数": "26.1km"
}
```
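Because field-extraction prompts return plain JSON, the response can be parsed directly. A minimal sketch; the sample string mirrors the response above, and the fence-stripping step is only a defensive assumption:
```python
import json

# Sample response text; in practice this is output.outputs[0].text (vLLM)
# or the decoded string from the Transformers example.
response_text = '{"发票号码": "42609332", "总金额": "¥77.10元", "里程数": "26.1km"}'

# Defensive: strip a surrounding ```json fence if the model ever emits one (assumption).
cleaned = response_text.strip().removeprefix("```json").removesuffix("```").strip()
fields = json.loads(cleaned)
print(fields["发票号码"], fields["总金额"])
```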
### Video Subtitle Extraction
Automatically extracts subtitles from video frames, including bilingual subtitles.
<p align="left">
<img src="./assets/vis_subtitle1.png" width="40%"/> <br>
<img src="./assets/vis_subtitle2.png" width="40%"/> <br>
<img src="./assets/vis_subtitle3.png" width="37.5%"/> <br>
</p>
### Image Translation
Performs end-to-end translation of photographed or document images in multiple languages into Chinese or English text output. It currently supports 14 frequently used languages (German, Spanish, Turkish, Italian, Russian, French, Portuguese, Arabic, Thai, Vietnamese, Indonesian, Malay, Japanese, and Korean) translated into Chinese/English, as well as Chinese-English translation in both directions (the model won the small-model track of the ICDAR 2025 end-to-end document translation competition).
<p align="left">
<img src="./assets/translation2.png" width="40%"/> <br>
</p>
</details>
## 📚 Citation
```
@misc{hunyuanvisionteam2025hunyuanocrtechnicalreport,
title={HunyuanOCR Technical Report},
author={Hunyuan Vision Team and Pengyuan Lyu and Xingyu Wan and Gengluo Li and Shangpin Peng and Weinong Wang and Liang Wu and Huawen Shen and Yu Zhou and Canhui Tang and Qi Yang and Qiming Peng and Bin Luo and Hower Yang and Xinsong Zhang and Jinnian Zhang and Houwen Peng and Hongming Yang and Senhao Xie and Longsha Zhou and Ge Pei and Binghong Wu and Kan Wu and Jieneng Yang and Bochao Wang and Kai Liu and Jianchen Zhu and Jie Jiang and Linus and Han Hu and Chengquan Zhang},
year={2025},
journal={arXiv preprint arXiv:2511.19575},
url={https://arxiv.org/abs/2511.19575},
}
```
## 🙏 Acknowledgements
We sincerely thank the authors and contributors of [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [MinerU](https://github.com/opendatalab/MinerU), [MonkeyOCR](https://github.com/Yuliang-Liu/MonkeyOCR), [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR), and [dots.ocr](https://github.com/rednote-hilab/dots.ocr) for their outstanding open-source work and valuable research ideas.
We also thank the following valuable open-source benchmarks: [OmniDocBench](https://github.com/opendatalab/OmniDocBench), [OCRBench](https://github.com/Yuliang-Liu/MultimodalOCR/tree/main/OCRBench), and [DoTA](https://github.com/liangyupu/DIMTDA).
Special thanks to the vLLM and Hugging Face communities for their Day-0 support for inference and deployment.