api.rst 6.58 KB
Newer Older
xu rui's avatar
xu rui committed
1

xu rui's avatar
xu rui committed
2
3
API Usage
===========
icecraft's avatar
icecraft committed
4

xu rui's avatar
xu rui committed
5
6
7
8

PDF
----

icecraft's avatar
icecraft committed
9
10
11
Local File Example
^^^^^^^^^^^^^^^^^^

xu rui's avatar
xu rui committed
12
13
14
15
16
.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
xu rui's avatar
xu rui committed
17
18
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
xu rui's avatar
xu rui committed
19

xu rui's avatar
xu rui committed
20
    # args
xu rui's avatar
xu rui committed
21
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
xu rui's avatar
xu rui committed
22
    # keep only the file name without its directory and final extension, so
    # paths like "./docs/abc.v2.pdf" give "abc.v2" and outputs stay in local_md_dir
    name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]
xu rui's avatar
xu rui committed
23

xu rui's avatar
xu rui committed
24
    # prepare env
xu rui's avatar
xu rui committed
25
    local_image_dir, local_md_dir = "output/images", "output"
xu rui's avatar
xu rui committed
26
27
    image_dir = str(os.path.basename(local_image_dir))

xu rui's avatar
xu rui committed
28
29
30
31
    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
icecraft's avatar
icecraft committed
32
    )
xu rui's avatar
xu rui committed
33
34
    image_dir = str(os.path.basename(local_image_dir))

xu rui's avatar
xu rui committed
35
    # read bytes
xu rui's avatar
xu rui committed
36
    reader1 = FileBasedDataReader("")
xu rui's avatar
xu rui committed
37
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
xu rui's avatar
xu rui committed
38

xu rui's avatar
xu rui committed
39
40
41
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
xu rui's avatar
xu rui committed
42

xu rui's avatar
xu rui committed
43
44
    ## inference 
    infer_result = ds.apply(doc_analyze, ocr=True)
xu rui's avatar
xu rui committed
45

xu rui's avatar
xu rui committed
46
47
    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
xu rui's avatar
xu rui committed
48

xu rui's avatar
xu rui committed
49
50
    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)
xu rui's avatar
xu rui committed
51

xu rui's avatar
xu rui committed
52
53
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
xu rui's avatar
xu rui committed
54

xu rui's avatar
xu rui committed
55
56
    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
xu rui's avatar
xu rui committed
57

xu rui's avatar
xu rui committed
58
59
    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
xu rui's avatar
xu rui committed
60
61


icecraft's avatar
icecraft committed
62
63
64
65
66
67
68
69
S3 File Example
^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
xu rui's avatar
xu rui committed
70
71
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
icecraft's avatar
icecraft committed
72
73
74
75
76
77
78
79
80
81
82

    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
    ak = "{Your S3 access key}"  # replace with real s3 access key
    sk = "{Your S3 secret key}"  # replace with real s3 secret key
    endpoint_url = "{Your S3 endpoint_url}"  # replace with real s3 endpoint_url


    reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url)  # replace `unittest/tmp` with the real s3 prefix
    writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
    image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)

xu rui's avatar
xu rui committed
83
84
85
86
87
88
89
90
    # args
    pdf_file_name = (
        "s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf"  # replace with the real s3 path
    )

    # prepare env
    local_dir = "output"
    # the draw_* calls below write PDFs into this local directory, so create it first
    os.makedirs(local_dir, exist_ok=True)
    # file name without directory and final extension (safe for dotted names)
    name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]
icecraft's avatar
icecraft committed
91

xu rui's avatar
xu rui committed
92
    # read bytes
icecraft's avatar
icecraft committed
93
94
    pdf_bytes = reader.read(pdf_file_name)  # read the pdf content

xu rui's avatar
xu rui committed
95
96
97
    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)
icecraft's avatar
icecraft committed
98

xu rui's avatar
xu rui committed
99
100
    ## inference 
    infer_result = ds.apply(doc_analyze, ocr=True)
icecraft's avatar
icecraft committed
101

xu rui's avatar
xu rui committed
102
103
    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
icecraft's avatar
icecraft committed
104

xu rui's avatar
xu rui committed
105
106
    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)
icecraft's avatar
icecraft committed
107

xu rui's avatar
xu rui committed
108
109
110
111
112
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local

    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))   # dump to local 
icecraft's avatar
icecraft committed
113

xu rui's avatar
xu rui committed
114
115
    ### dump markdown
    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")    # dump to remote s3
icecraft's avatar
icecraft committed
116
117


xu rui's avatar
xu rui committed
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

MS-Office 
----------

.. code:: python 

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # directory name handed to dump_md as the image location
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"     # replace with real ms-office file

    # strip the directory and only the final extension, so a path such as
    # "./docs/report.v2.pptx" yields "report.v2" instead of an empty string
    input_file_name = os.path.splitext(os.path.basename(input_file))[0]
    ds = read_local_office(input_file)[0]

    ## inference -> pipeline -> dump markdown
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.


Image
---------

Single Image File 
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # directory name handed to dump_md as the image location
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"       # replace with real image file

    # strip the directory and only the final extension, so a path such as
    # "./scans/page.01.jpg" yields "page.01" instead of an empty string
    input_file_name = os.path.splitext(os.path.basename(input_file))[0]
    ds = read_local_images(input_file)[0]

    ## inference -> pipeline -> dump markdown
    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )


Directory That Contains Images 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    # directory name handed to dump_md as the image location
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer = FileBasedDataWriter(local_image_dir)
    md_writer = FileBasedDataWriter(local_md_dir)

    # proc
    ## Create Dataset Instance
    input_directory = "some_image_dir/"       # replace with real directory that contains images

    # keep the whole list of datasets; indexing with [0] here would process
    # only the first one instead of every matching image
    # NOTE(review): assumes read_local_images returns one dataset per image - confirm against the magic_pdf API
    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

    # write one markdown file per image, named by its position in the list
    for count, ds in enumerate(dss):
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{count}.md", image_dir
        )


xu rui's avatar
xu rui committed
226
See :doc:`../data/data_reader_writer` for more reader and writer examples, and :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.