Commit c91a5d8b authored by Doug Danat
Browse files

switch to using BeautifulSoup HTML loader so title is also captured

parent 77f4ffdd
...@@ -21,7 +21,7 @@ from langchain_community.document_loaders import ( ...@@ -21,7 +21,7 @@ from langchain_community.document_loaders import (
TextLoader, TextLoader,
PyPDFLoader, PyPDFLoader,
CSVLoader, CSVLoader,
UnstructuredHTMLLoader, BSHTMLLoader,
Docx2txtLoader, Docx2txtLoader,
UnstructuredEPubLoader, UnstructuredEPubLoader,
UnstructuredWordDocumentLoader, UnstructuredWordDocumentLoader,
...@@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): ...@@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
elif file_ext == "xml": elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path) loader = UnstructuredXMLLoader(file_path)
elif file_ext in ["htm", "html"]: elif file_ext in ["htm", "html"]:
loader = UnstructuredHTMLLoader(file_path) loader = BSHTMLLoader(file_path)
elif file_ext == "md": elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path) loader = UnstructuredMarkdownLoader(file_path)
elif file_content_type == "application/epub+zip": elif file_content_type == "application/epub+zip":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment