"vscode:/vscode.git/clone" did not exist on "a5c38ebea7a0287872f421f872b548bd74e0213b"
main_text_font.py.bak 1.02 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import collections


def get_main_text_font(pdf_docs):
    """Return the name of the font that covers the most characters.

    Every page is asked for its ``get_text('dict')`` layout, and each span's
    font is weighted by the length of that span's text. The main text font is
    chosen by total character count rather than by number of spans, so a font
    used in a few long spans beats one used in many short spans.

    Args:
        pdf_docs: Iterable of page objects exposing ``get_text('dict')``,
            which returns a mapping with a ``'blocks'`` list of block dicts
            (``'lines'`` -> ``'spans'`` -> ``'font'`` / ``'text'``).
            NOTE(review): presumably PyMuPDF pages -- confirm against caller.

    Returns:
        The font name with the highest accumulated character count, or
        ``None`` when no span with a font and non-empty text is found
        (empty document, image-only pages, ...).
    """
    font_names = collections.Counter()
    for page in pdf_docs:
        blocks = page.get_text('dict').get('blocks')
        for block in blocks or ():
            # Blocks without a 'lines' entry carry no text spans; skip them.
            for line in block.get('lines') or ():
                for span in line.get('spans') or ():
                    text = span.get('text')
                    # Ignore spans without a font or with missing/empty text.
                    if 'font' in span and text:
                        font_names[span['font']] += len(text)

    # most_common(1) is an empty list for an empty counter; indexing it would
    # raise IndexError, so report "no main font" explicitly instead.
    if not font_names:
        return None
    return font_names.most_common(1)[0][0]