spark_api.py 1.1 KB
Newer Older
1
from loguru import logger
kernel.h@qq.com's avatar
kernel.h@qq.com committed
2

3
from magic_pdf.config.drop_reason import DropReason
kernel.h@qq.com's avatar
kernel.h@qq.com committed
4
5


赵小蒙's avatar
赵小蒙 committed
6
def get_data_source(jso: dict):
    """Return the data source recorded in ``jso``.

    Reads the ``'data_source'`` key first. When that key is absent or its
    value is ``None``, the ``'file_source'`` key is used instead.

    Args:
        jso: Record to read from.

    Returns:
        The value found, or ``None`` when neither key yields a value.
    """
    data_source = jso.get('data_source')
    # Fall back only on None, so falsy-but-present values (e.g. '') are kept.
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source
赵小蒙's avatar
赵小蒙 committed
11
12


赵小蒙's avatar
赵小蒙 committed
13
def get_data_type(jso: dict):
    """Return the data type recorded in ``jso``.

    Reads the ``'data_type'`` key first. When that key is absent or its
    value is ``None``, the ``'file_type'`` key is used instead.

    Args:
        jso: Record to read from.

    Returns:
        The value found, or ``None`` when neither key yields a value.
    """
    data_type = jso.get('data_type')
    # Fall back only on None, so falsy-but-present values (e.g. '') are kept.
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type
kernel.h@qq.com's avatar
kernel.h@qq.com committed
18
19


赵小蒙's avatar
赵小蒙 committed
20
def get_bookid(jso: dict):
    """Return the book identifier recorded in ``jso``.

    Reads the ``'bookid'`` key first. When that key is absent or its value
    is ``None``, the ``'original_file_id'`` key is used instead.

    Args:
        jso: Record to read from.

    Returns:
        The value found, or ``None`` when neither key yields a value.
    """
    book_id = jso.get('bookid')
    # Fall back only on None, so falsy-but-present values (e.g. 0) are kept.
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id
kernel.h@qq.com's avatar
kernel.h@qq.com committed
25
26


赵小蒙's avatar
赵小蒙 committed
27
28
def exception_handler(jso: dict, e):
    """Log ``e`` and flag ``jso`` as a record to be dropped.

    ``jso`` is mutated in place and the same object is returned:

    * ``'_need_drop'`` is set to ``True``
    * ``'_drop_reason'`` is set to ``DropReason.Exception``
    * ``'_exception'`` is set to ``'ERROR: '`` followed by ``e`` formatted
      into the string

    Args:
        jso: Record being processed when the error occurred.
        e: The error to log and record.

    Returns:
        The same ``jso`` dict, with the three keys above set.
    """
    logger.exception(e)
    jso['_need_drop'] = True
    jso['_drop_reason'] = DropReason.Exception
    jso['_exception'] = f'ERROR: {e}'
    return jso
赵小蒙's avatar
赵小蒙 committed
33

34

赵小蒙's avatar
赵小蒙 committed
35
36
def get_bookname(jso: dict):
    """Build the book name ``'<data_source>/<file_id>'`` for ``jso``.

    The data source comes from ``get_data_source(jso)`` and the file id from
    the ``'file_id'`` key.

    Args:
        jso: Record to read from.

    Returns:
        The two values joined with ``'/'``. A missing value is formatted as
        the text ``'None'`` because both parts are interpolated into an
        f-string without any check.
    """
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f'{data_source}/{file_id}'
    return book_name
40
41


42
43
def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the JSON record and return it as a dict.

    Args:
        jso: Record containing the ``'_pdf_type'`` and
            ``'doc_layout_result'`` keys.

    Returns:
        A new dict with ``'_pdf_type'`` copied from ``jso['_pdf_type']`` and
        ``'model_list'`` set to ``jso['doc_layout_result']`` (the same
        object, not a copy).

    Raises:
        KeyError: If either key is missing from ``jso``.
    """
    return {
        '_pdf_type': jso['_pdf_type'],
        'model_list': jso['doc_layout_result'],
    }