api.rst 8.33 KB
Newer Older
xu rui's avatar
xu rui committed
1

icecraft's avatar
icecraft committed
2
API Usage
xu rui's avatar
xu rui committed
3
===========
icecraft's avatar
icecraft committed
4

xu rui's avatar
xu rui committed
5
6
7
8

PDF
----

icecraft's avatar
icecraft committed
9
10
11
Local File Example
^^^^^^^^^^^^^^^^^^

xu rui's avatar
xu rui committed
12
13
14
15
16
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
xu rui's avatar
xu rui committed
17
18
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
icecraft's avatar
icecraft committed
19
    from magic_pdf.config.enums import SupportedPdfParseMethod
xu rui's avatar
xu rui committed
20

xu rui's avatar
xu rui committed
21
    # args
xu rui's avatar
xu rui committed
22
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
xu rui's avatar
xu rui committed
23
    name_without_suff = pdf_file_name.split(".")[0]
xu rui's avatar
xu rui committed
24

xu rui's avatar
xu rui committed
25
    # prepare env
xu rui's avatar
xu rui committed
26
    local_image_dir, local_md_dir = "output/images", "output"
xu rui's avatar
xu rui committed
27
28
    image_dir = str(os.path.basename(local_image_dir))

xu rui's avatar
xu rui committed
29
30
31
32
    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
icecraft's avatar
icecraft committed
33
    )
xu rui's avatar
xu rui committed
34

xu rui's avatar
xu rui committed
35
    # read bytes
xu rui's avatar
xu rui committed
36
    reader1 = FileBasedDataReader("")
xu rui's avatar
xu rui committed
37
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
xu rui's avatar
xu rui committed
38

xu rui's avatar
xu rui committed
39
40
41
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
xu rui's avatar
xu rui committed
42

icecraft's avatar
icecraft committed
43
44
45
46
47
48
49
50
51
52
53
54
    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)
xu rui's avatar
xu rui committed
55

xu rui's avatar
xu rui committed
56
57
    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
xu rui's avatar
xu rui committed
58

xu rui's avatar
xu rui committed
59
60
61
    ### get model inference result
    model_inference_result = infer_result.get_infer_res()

xu rui's avatar
xu rui committed
62
63
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
xu rui's avatar
xu rui committed
64

xu rui's avatar
xu rui committed
65
66
    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
xu rui's avatar
xu rui committed
67

68
69
70
    ### get markdown content
    md_content = pipe_result.get_markdown(image_dir)

xu rui's avatar
xu rui committed
71
72
    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
xu rui's avatar
xu rui committed
73

74
75
76
    ### get content list content
    content_list_content = pipe_result.get_content_list(image_dir)

icecraft's avatar
icecraft committed
77
78
79
    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

xu rui's avatar
xu rui committed
80
81
82
    ### get middle json
    middle_json_content = pipe_result.get_middle_json()

83
84
85
    ### dump middle json
    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

xu rui's avatar
xu rui committed
86

xu rui's avatar
xu rui committed
87

icecraft's avatar
icecraft committed
88
89
90
91
92
93
94
95
S3 File Example
^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
xu rui's avatar
xu rui committed
96
97
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
xu rui's avatar
xu rui committed
98
    from magic_pdf.config.enums import SupportedPdfParseMethod
icecraft's avatar
icecraft committed
99
100
101
102
103
104
105
106
107

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
xu rui's avatar
xu rui committed
108
109
110
111
    md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)

    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))
icecraft's avatar
icecraft committed
112

xu rui's avatar
xu rui committed
113
114
    # args
    pdf_file_name = (
xu rui's avatar
xu rui committed
115
        f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
xu rui's avatar
xu rui committed
116
117
118
119
120
    )

    # prepare env
    local_dir = "output"
    name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
icecraft's avatar
icecraft committed
121

xu rui's avatar
xu rui committed
122
    # read bytes
icecraft's avatar
icecraft committed
123
124
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

xu rui's avatar
xu rui committed
125
126
127
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
icecraft's avatar
icecraft committed
128

icecraft's avatar
icecraft committed
129
130
131
    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
icecraft's avatar
icecraft committed
132

icecraft's avatar
icecraft committed
133
134
135
136
137
        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
icecraft's avatar
icecraft committed
138

icecraft's avatar
icecraft committed
139
140
141
142
143
        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
icecraft's avatar
icecraft committed
144

xu rui's avatar
xu rui committed
145
146
147
    ### get model inference result
    model_inference_result = infer_result.get_infer_res()

xu rui's avatar
xu rui committed
148
    ### draw layout result on each page
icecraft's avatar
icecraft committed
149
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
xu rui's avatar
xu rui committed
150
151

    ### draw spans result on each page
icecraft's avatar
icecraft committed
152
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
icecraft's avatar
icecraft committed
153

xu rui's avatar
xu rui committed
154
    ### dump markdown
icecraft's avatar
icecraft committed
155
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
icecraft's avatar
icecraft committed
156

icecraft's avatar
icecraft committed
157
158
    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
icecraft's avatar
icecraft committed
159

xu rui's avatar
xu rui committed
160
    ### get markdown content
161
    md_content = pipe_result.get_markdown(image_dir)
xu rui's avatar
xu rui committed
162
163

    ### get content list content
164
    content_list_content = pipe_result.get_content_list(image_dir)
xu rui's avatar
xu rui committed
165
166
167
168

    ### get middle json
    middle_json_content = pipe_result.get_middle_json()

169
170
    ### dump middle json
    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
xu rui's avatar
xu rui committed
171

icecraft's avatar
icecraft committed
172
MS-Office
xu rui's avatar
xu rui committed
173
174
----------

icecraft's avatar
icecraft committed
175
.. code:: python
xu rui's avatar
xu rui committed
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"     # replace with real ms-office file

    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

icecraft's avatar
icecraft committed
200
    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
201
202
203
204
205
206
207
208
209
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.


Image
---------

icecraft's avatar
icecraft committed
210
Single Image File
xu rui's avatar
xu rui committed
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"       # replace with real image file

    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

238
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
239
240
241
242
        md_writer, f"{input_file_name}.md", image_dir
    )


icecraft's avatar
icecraft committed
243
Directory That Contains Images
xu rui's avatar
xu rui committed
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_directory = "some_image_dir/"       # replace with real directory that contains images


xu rui's avatar
xu rui committed
269
    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])
xu rui's avatar
xu rui committed
270
271
272

    count = 0
    for ds in dss:
273
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
xu rui's avatar
xu rui committed
274
275
276
277
278
            md_writer, f"{count}.md", image_dir
        )
        count += 1


xu rui's avatar
xu rui committed
279
See :doc:`../data/data_reader_writer` for more reader and writer examples, and see :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.