utils.py 742 Bytes
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from langchain_community.document_loaders import PyMuPDFLoader
import docx
from pptx import Presentation

def extract_text(path):
    """Return the entire contents of the text file at *path* as a string.

    The file is opened in text mode using the default encoding chosen by
    ``open`` and is closed before this function returns, even if reading
    raises.

    Args:
        path: Location of the file to read.

    Returns:
        The file's full text.
    """
    # A context manager releases the handle deterministically instead of
    # leaving it open until the garbage collector gets around to it.
    with open(path, 'r') as handle:
        return handle.read()

def extract_pdf(path):
    """Load the PDF at *path* with PyMuPDFLoader and return its text.

    Args:
        path: Location of the PDF file to load.

    Returns:
        The ``page_content`` of every document produced by the loader,
        in loader order, separated by a blank line.
    """
    documents = PyMuPDFLoader(path).load()
    return '\n\n'.join(document.page_content for document in documents)

def extract_docx(path):
    """Return the paragraph text of the Word document at *path*.

    Args:
        path: Location of the .docx file to open.

    Returns:
        The text of every entry in the document's ``paragraphs``
        collection, in order, separated by a blank line.
    """
    doc = docx.Document(path)
    content = '\n\n'.join(paragraph.text for paragraph in doc.paragraphs)
    # Hand the joined text back explicitly; without this statement the
    # function would fall off the end and callers would receive None.
    return content

def extract_pptx(path):
    """Return the text of the PowerPoint presentation at *path*.

    Slides are visited in order, and within each slide every shape that
    exposes a ``text`` attribute contributes its text followed by a
    newline. Shapes without that attribute are skipped.

    Args:
        path: Location of the .pptx file to open.

    Returns:
        The collected shape text as a single string; empty if no shape
        carries text.
    """
    deck = Presentation(path)
    pieces = []
    for slide in deck.slides:
        pieces.extend(
            shape.text + "\n"
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    return "".join(pieces)