API Usage
===========

PDF
----

Local File Example
^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    # Drop the directory part and only the last extension, so that paths such as
    # "./docs/report.v2.pdf" give "report.v2" instead of an empty string.
    name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # last component of the image directory, passed to the dump_* calls below
    image_dir = str(os.path.basename(local_image_dir))

    # creating the nested image directory also creates `local_md_dir`
    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

S3 File Example
^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url

    # S3 prefix that receives the extracted images; the same value is passed to the
    # dump_* calls below so the two stay in sync.
    # NOTE: confirm against the dump_md API whether a bucket prefix or a path
    # relative to the markdown file is expected here.
    image_dir = 'unittest/tmp/images'

    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter(image_dir, bucket_name, ak, sk, endpoint_url)

    # args
    pdf_file_name = (
        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
    )

    # prepare env
    local_dir = "output"  # the draw_* debugging PDFs are written to this local directory
    os.makedirs(local_dir, exist_ok=True)
    # file name without directory and without its last extension
    name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

    # read bytes
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ## inference
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)

        ## pipeline
        pipe_result = infer_result.pipe_ocr_mode(image_writer)

    else:
        infer_result = ds.apply(doc_analyze, ocr=False)

        ## pipeline
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_dir, f"{name_without_suff}_model.pdf"))

    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_dir, f"{name_without_suff}_layout.pdf"))

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_dir, f"{name_without_suff}_spans.pdf"))

    ### dump markdown (written through the S3 writer)
    pipe_result.dump_md(writer, f"{name_without_suff}.md", image_dir)

    ### dump content list (written through the S3 writer)
    pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", image_dir)

MS-Office
----------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # last component of the image directory, passed to dump_md below
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"     # replace with real ms-office file

    # Drop the directory part and only the last extension, so a path such as
    # "./slides/deck.v2.ppt" gives "deck.v2" instead of an empty string.
    input_file_name = os.path.splitext(os.path.basename(input_file))[0]
    ds = read_local_office(input_file)[0]  # the reader returns a sequence; take its first dataset

    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.


Image
---------

Single Image File
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # last component of the image directory, passed to dump_md below
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"       # replace with real image file

    # Drop the directory part and only the last extension, so a path such as
    # "./scans/page.1.jpg" gives "page.1" instead of an empty string.
    input_file_name = os.path.splitext(os.path.basename(input_file))[0]
    ds = read_local_images(input_file)[0]  # the reader returns a sequence; take its first dataset

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )


Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # last component of the image directory, passed to dump_md below
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_directory = "some_image_dir/"       # replace with real directory that contains images

    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

    # one markdown file per dataset, named after its position: 0.md, 1.md, ...
    for count, ds in enumerate(dss):
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{count}.md", image_dir
        )


See :doc:`../data/data_reader_writer` for more reader and writer examples, and :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.