utils.py 765 Bytes
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
import docx
Rayyyyy's avatar
Rayyyyy committed
2
from langchain_community.document_loaders import PyMuPDFLoader
Rayyyyy's avatar
Rayyyyy committed
3
4
from pptx import Presentation

Rayyyyy's avatar
Rayyyyy committed
5

Rayyyyy's avatar
Rayyyyy committed
6
def extract_text(path, encoding=None):
    """Return the full contents of the text file at *path*.

    Args:
        path: Location of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, as before.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents cannot be decoded.
    """
    # The context manager closes the handle even if read() raises.
    with open(path, "r", encoding=encoding) as handle:
        return handle.read()

Rayyyyy's avatar
Rayyyyy committed
9
10
11
12
13

def extract_pdf(path):
    """Return the text of the PDF at *path* as a single string.

    The file is loaded with ``PyMuPDFLoader``; the ``page_content`` of every
    document it returns is joined with a blank line between entries.

    Args:
        path: Location of the PDF file.

    Returns:
        The concatenated text, or an empty string when the loader returns
        no documents.
    """
    loader = PyMuPDFLoader(path)
    # NOTE(review): presumably one document per page — confirm against the
    # loader's default mode.
    documents = loader.load()
    return "\n\n".join(document.page_content for document in documents)

Rayyyyy's avatar
Rayyyyy committed
17

Rayyyyy's avatar
Rayyyyy committed
18
19
20
21
22
def extract_docx(path):
    """Return the paragraph text of the Word document at *path*.

    Only the entries of ``Document.paragraphs`` are read; their text is
    joined with a blank line between paragraphs.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        The concatenated paragraph text, or an empty string when the
        document has no paragraphs.
    """
    document = docx.Document(path)
    return "\n\n".join(paragraph.text for paragraph in document.paragraphs)

Rayyyyy's avatar
Rayyyyy committed
26
27
28
29
30
31
32
33
34

def extract_pptx(path):
    """Return the text of every shape in the presentation at *path*.

    Slides and their shapes are visited in order. Each shape that exposes a
    ``text`` attribute contributes its text followed by a newline; shapes
    without that attribute are skipped.

    Args:
        path: Location of the ``.pptx`` file.

    Returns:
        The collected text, or an empty string when no shape has text.
    """
    presentation = Presentation(path)
    # A single join avoids rebuilding the string on every shape.
    return "".join(
        shape.text + "\n"
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )