api.rst 8.51 KB
Newer Older
xu rui's avatar
xu rui committed
1

icecraft's avatar
icecraft committed
2
API Usage
xu rui's avatar
xu rui committed
3
===========
icecraft's avatar
icecraft committed
4

xu rui's avatar
xu rui committed
5
6
7
8

PDF
----

icecraft's avatar
icecraft committed
9
10
11
Local File Example
^^^^^^^^^^^^^^^^^^

xu rui's avatar
xu rui committed
12
13
14
15
16
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
xu rui's avatar
xu rui committed
17
18
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
icecraft's avatar
icecraft committed
19
    from magic_pdf.config.enums import SupportedPdfParseMethod
xu rui's avatar
xu rui committed
20
    from magic_pdf.config.make_content_config import DropMode, MakeMode
xu rui's avatar
xu rui committed
21

xu rui's avatar
xu rui committed
22
    # args
xu rui's avatar
xu rui committed
23
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
xu rui's avatar
xu rui committed
24
    name_without_suff = pdf_file_name.split(".")[0]
xu rui's avatar
xu rui committed
25

xu rui's avatar
xu rui committed
26
    # prepare env
xu rui's avatar
xu rui committed
27
    local_image_dir, local_md_dir = "output/images", "output"
xu rui's avatar
xu rui committed
28
29
    image_dir = str(os.path.basename(local_image_dir))

xu rui's avatar
xu rui committed
30
31
32
33
    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
icecraft's avatar
icecraft committed
34
    )
xu rui's avatar
xu rui committed
35

xu rui's avatar
xu rui committed
36
    # read bytes
xu rui's avatar
xu rui committed
37
    reader1 = FileBasedDataReader("")
xu rui's avatar
xu rui committed
38
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
xu rui's avatar
xu rui committed
39

xu rui's avatar
xu rui committed
40
41
42
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
xu rui's avatar
xu rui committed
43

icecraft's avatar
icecraft committed
44
45
46
47
48
49
50
51
52
53
54
55
    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)
xu rui's avatar
xu rui committed
56

xu rui's avatar
xu rui committed
57
58
    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
xu rui's avatar
xu rui committed
59

xu rui's avatar
xu rui committed
60
61
62
    ### get model inference result
    model_inference_result = infer_result.get_infer_res()

xu rui's avatar
xu rui committed
63
64
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
xu rui's avatar
xu rui committed
65

xu rui's avatar
xu rui committed
66
67
    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
xu rui's avatar
xu rui committed
68

xu rui's avatar
xu rui committed
69
70
    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
xu rui's avatar
xu rui committed
71

icecraft's avatar
icecraft committed
72
73
74
    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

xu rui's avatar
xu rui committed
75
76
77
78
79
80
81
82
83
84
    ### get markdown content
    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)

    ### get content list content
    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 

    ### get middle json
    middle_json_content = pipe_result.get_middle_json()


xu rui's avatar
xu rui committed
85

icecraft's avatar
icecraft committed
86
87
88
89
90
91
92
93
S3 File Example
^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
xu rui's avatar
xu rui committed
94
95
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
xu rui's avatar
xu rui committed
96
97
    from magic_pdf.config.make_content_config import DropMode, MakeMode
    from magic_pdf.config.enums import SupportedPdfParseMethod
icecraft's avatar
icecraft committed
98
99
100
101
102
103
104
105
106

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
xu rui's avatar
xu rui committed
107
108
109
110
    md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)

    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))
icecraft's avatar
icecraft committed
111

xu rui's avatar
xu rui committed
112
113
    # args
    pdf_file_name = (
xu rui's avatar
xu rui committed
114
        f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
xu rui's avatar
xu rui committed
115
116
117
118
119
    )

    # prepare env
    local_dir = "output"
    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
icecraft's avatar
icecraft committed
120

xu rui's avatar
xu rui committed
121
    # read bytes
icecraft's avatar
icecraft committed
122
123
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

xu rui's avatar
xu rui committed
124
125
126
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
icecraft's avatar
icecraft committed
127

icecraft's avatar
icecraft committed
128
129
130
    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
icecraft's avatar
icecraft committed
131

icecraft's avatar
icecraft committed
132
133
134
135
136
        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
icecraft's avatar
icecraft committed
137

icecraft's avatar
icecraft committed
138
139
140
141
142
        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
icecraft's avatar
icecraft committed
143

xu rui's avatar
xu rui committed
144
145
146
    ### get model inference result
    model_inference_result = infer_result.get_infer_res()

xu rui's avatar
xu rui committed
147
    ### draw layout result on each page
icecraft's avatar
icecraft committed
148
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
xu rui's avatar
xu rui committed
149
150

    ### draw spans result on each page
icecraft's avatar
icecraft committed
151
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
icecraft's avatar
icecraft committed
152

xu rui's avatar
xu rui committed
153
    ### dump markdown
icecraft's avatar
icecraft committed
154
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
icecraft's avatar
icecraft committed
155

icecraft's avatar
icecraft committed
156
157
    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
icecraft's avatar
icecraft committed
158

xu rui's avatar
xu rui committed
159
160
161
162
163
164
165
166
167
168
    ### get markdown content
    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)

    ### get content list content
    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 

    ### get middle json
    middle_json_content = pipe_result.get_middle_json()


xu rui's avatar
xu rui committed
169

icecraft's avatar
icecraft committed
170
MS-Office
xu rui's avatar
xu rui committed
171
172
----------

icecraft's avatar
icecraft committed
173
.. code:: python
xu rui's avatar
xu rui committed
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"     # replace with real ms-office file

    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

icecraft's avatar
icecraft committed
198
    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
199
200
201
202
203
204
205
206
207
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.


Image
---------

icecraft's avatar
icecraft committed
208
Single Image File
xu rui's avatar
xu rui committed
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"       # replace with real image file

    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

236
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
237
238
239
240
        md_writer, f"{input_file_name}.md", image_dir
    )


icecraft's avatar
icecraft committed
241
Directory That Contains Images
xu rui's avatar
xu rui committed
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_directory = "some_image_dir/"       # replace with real directory that contains images


xu rui's avatar
xu rui committed
267
    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
xu rui's avatar
xu rui committed
268
269
270

    count = 0
    for ds in dss:
271
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
272
273
274
275
276
            md_writer, f"{count}.md", image_dir
        )
        count += 1


xu rui's avatar
xu rui committed
277
See :doc:`../data/data_reader_writer` for more reader and writer examples, and see :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.