Commit dbde6281 (unverified), authored by Timothy Jaeryang Baek, committed by GitHub
Browse files

Merge pull request #2923 from mindspawn/outlook-msg

Support Outlook Message File Format
parents 8df0c9e0 2412f31e
...@@ -9,6 +9,7 @@ from fastapi import ( ...@@ -9,6 +9,7 @@ from fastapi import (
) )
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
import os, shutil, logging, re import os, shutil, logging, re
from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import List, Union, Sequence from typing import List, Union, Sequence
...@@ -30,6 +31,7 @@ from langchain_community.document_loaders import ( ...@@ -30,6 +31,7 @@ from langchain_community.document_loaders import (
UnstructuredExcelLoader, UnstructuredExcelLoader,
UnstructuredPowerPointLoader, UnstructuredPowerPointLoader,
YoutubeLoader, YoutubeLoader,
OutlookMessageLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
...@@ -879,6 +881,13 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b ...@@ -879,6 +881,13 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b
texts = [doc.page_content for doc in docs] texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs] metadatas = [doc.metadata for doc in docs]
# ChromaDB does not accept datetime objects as metadata values,
# so convert any datetime to its string form before storing.
for metadata in metadatas:
for key, value in metadata.items():
if isinstance(value, datetime):
metadata[key] = str(value)
try: try:
if overwrite: if overwrite:
for collection in CHROMA_CLIENT.list_collections(): for collection in CHROMA_CLIENT.list_collections():
...@@ -965,6 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): ...@@ -965,6 +974,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"swift", "swift",
"vue", "vue",
"svelte", "svelte",
"msg",
] ]
if file_ext == "pdf": if file_ext == "pdf":
...@@ -999,6 +1009,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): ...@@ -999,6 +1009,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
] or file_ext in ["ppt", "pptx"]: ] or file_ext in ["ppt", "pptx"]:
loader = UnstructuredPowerPointLoader(file_path) loader = UnstructuredPowerPointLoader(file_path)
elif file_ext == "msg":
loader = OutlookMessageLoader(file_path)
elif file_ext in known_source_ext or ( elif file_ext in known_source_ext or (
file_content_type and file_content_type.find("text/") >= 0 file_content_type and file_content_type.find("text/") >= 0
): ):
......
...@@ -56,4 +56,6 @@ PyJWT[crypto]==2.8.0 ...@@ -56,4 +56,6 @@ PyJWT[crypto]==2.8.0
black==24.4.2 black==24.4.2
langfuse==2.33.0 langfuse==2.33.0
youtube-transcript-api==0.6.2 youtube-transcript-api==0.6.2
pytube==15.0.0 pytube==15.0.0
\ No newline at end of file
extract_msg
...@@ -89,7 +89,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [ ...@@ -89,7 +89,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
'xls', 'xls',
'xlsx', 'xlsx',
'pptx', 'pptx',
'ppt' 'ppt',
'msg'
]; ];
// Source: https://kit.svelte.dev/docs/modules#$env-static-public // Source: https://kit.svelte.dev/docs/modules#$env-static-public
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment