Switch the HTML loader from UnstructuredHTMLLoader to the BeautifulSoup-based BSHTMLLoader so the document title is also captured

c91a5d8b · Doug Danat · 77f4ffdd · c91a5d8b
Commit c91a5d8b authored Mar 25, 2024 by Doug Danat
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

backend/apps/rag/main.py backend/apps/rag/main.py +2 -2

No files found.
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -21,7 +21,7 @@ from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    CSVLoader,
-    UnstructuredHTMLLoader,
+    BSHTMLLoader,
    Docx2txtLoader,
    UnstructuredEPubLoader,
    UnstructuredWordDocumentLoader,
@@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
    elif file_ext == "xml":
        loader = UnstructuredXMLLoader(file_path)
    elif file_ext in ["htm", "html"]:
-        loader = UnstructuredHTMLLoader(file_path)
+        loader = BSHTMLLoader(file_path)
    elif file_ext == "md":
        loader = UnstructuredMarkdownLoader(file_path)
    elif file_content_type == "application/epub+zip":