pre_clean.py 4.1 KB
Newer Older
quyuan's avatar
add ci  
quyuan committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
clean data
"""
import argparse
import os
import re
import htmltabletomd # type: ignore
import pypandoc
import argparse

parser = argparse.ArgumentParser(description="get tool type")
parser.add_argument(
    "--tool_name",
    type=str,
    required=True,
    help="input tool name",
)
parser.add_argument(
    "--download_dir",
    type=str,
    required=True,
    help="input download dir",
)
args = parser.parse_args()

def clean_markdown_images(content):
    """
    clean markdown images
    """
    pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)  
    cleaned_content = pattern.sub('', content)   
    return cleaned_content
   
def clean_ocrmath_photo(content):
    """
    clean ocrmath photo
    """
    pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)  
    cleaned_content = pattern.sub('', content)   
    return cleaned_content

def convert_html_table_to_md(html_table):
    """
    convert html table to markdown table
    """
    lines = html_table.strip().split('\n')  
    md_table = ''  
    if lines and '<tr>' in lines[0]:  
        in_thead = True  
        for line in lines:  
            if '<th>' in line:  
                cells = re.findall(r'<th>(.*?)</th>', line)  
                md_table += '| ' + ' | '.join(cells) + ' |\n'  
                in_thead = False  
            elif '<td>' in line and not in_thead:  
                cells = re.findall(r'<td>(.*?)</td>', line)  
                md_table += '| ' + ' | '.join(cells) + ' |\n'  
        md_table = md_table.rstrip() + '\n'    
    return md_table  
 
def convert_latext_to_md(content):
    """
    convert latex table to markdown table
    """
    tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)  
    placeholders = []  
    for table in tables:  
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
        replace_str = f"\\begin{{tabular}}{table}cl\\end{{tabular}}"
        content = content.replace(replace_str, placeholder)  
        try:
            pypandoc.convert_text(replace_str,  format="latex", to="md", outputfile="output.md", encoding="utf-8")
        except:
            markdown_string = replace_str
        else: 
            markdown_string = open('output.md', 'r', encoding='utf-8').read()
        placeholders.append((placeholder, markdown_string)) 
    new_content = content  
    for placeholder, md_table in placeholders:  
        new_content = new_content.replace(placeholder, md_table)  
        # 写入文件  
    return new_content

 
def convert_htmltale_to_md(content):
    """
    convert html table to markdown table
    """
    tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)  
    placeholders = []
    for table in tables:
        placeholder = f"<!-- TABLE_PLACEHOLDER_{len(placeholders)} -->"  
        content = content.replace(f"<table>{table}</table>", placeholder)  
        try:
            convert_table = htmltabletomd.convert_table(table)
        except:
            convert_table = table
        placeholders.append((placeholder,convert_table)) 
    new_content = content  
    for placeholder, md_table in placeholders:  
        new_content = new_content.replace(placeholder, md_table)  
        # 写入文件  
    return new_content

def clean_data(prod_type, download_dir):
    """
    clean data
    """
    tgt_dir = os.path.join(download_dir, prod_type, "cleaned")
    if not os.path.exists(tgt_dir):  
        os.makedirs(tgt_dir) 
    source_dir = os.path.join(download_dir, prod_type)
    filenames = os.listdir(source_dir)
    for filename in filenames:
        if filename.endswith('.md'):
            input_file = os.path.join(source_dir, filename)
            output_file = os.path.join(tgt_dir, "cleaned_" + filename)
            with open(input_file, 'r', encoding='utf-8') as fr:
                content = fr.read()
                new_content = clean_markdown_images(content)
                with open(output_file, 'w', encoding='utf-8') as fw:
                    fw.write(new_content)


if __name__ == '__main__':
    tool_type = args.tool_name
    download_dir = args.download_dir
    clean_data(tool_type, download_dir)