Commit aa1d3860 authored by Marclass
Browse files

Allow any file to be used for RAG.

Changed the RAG parser to prefer file extensions over MIME content types. If the file type is not recognized, assume it is a text file.
parent 6070e6bc
...@@ -144,36 +144,20 @@ def store_doc( ...@@ -144,36 +144,20 @@ def store_doc(
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
print(file.content_type) print(file.content_type)
if file.content_type not in [
"application/pdf", text_xml=["xml"]
"text/plain",
"text/csv",
"text/xml",
"text/x-python",
"text/css",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/octet-stream",
"application/x-javascript",
]:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
text_xml=["text/xml"]
octet_markdown=["md"] octet_markdown=["md"]
octet_plain=[ known_source_ext=[
"go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",
"css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
"pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
"hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
"rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
] ]
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
known_doc_ext=["doc","docx"]
file_ext=file.filename.split(".")[-1].lower() file_ext=file.filename.split(".")[-1].lower()
if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): known_type=True
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
try: try:
filename = file.filename filename = file.filename
...@@ -188,27 +172,22 @@ def store_doc( ...@@ -188,27 +172,22 @@ def store_doc(
collection_name = calculate_sha256(f)[:63] collection_name = calculate_sha256(f)[:63]
f.close() f.close()
if file.content_type == "application/pdf": if file_ext=="pdf":
loader = PyPDFLoader(file_path) loader = PyPDFLoader(file_path)
elif ( elif (file.content_type ==docx_type or file_ext in known_doc_ext):
file.content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file_ext=="csv":
elif file.content_type == "text/csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file.content_type in text_xml: elif file_ext in text_xml:
loader=UnstructuredXMLLoader(file_path) loader=UnstructuredXMLLoader(file_path)
elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
loader = TextLoader(file_path) loader = TextLoader(file_path)
elif file.content_type == "application/octet-stream": elif file_ext in octet_markdown:
if file_ext in octet_markdown:
loader = UnstructuredMarkdownLoader(file_path) loader = UnstructuredMarkdownLoader(file_path)
if file_ext in octet_plain: else:
loader = TextLoader(file_path)
elif file.content_type == "application/x-javascript":
loader = TextLoader(file_path) loader = TextLoader(file_path)
known_type=False
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name) result = store_data_in_vector_db(data, collection_name)
...@@ -218,6 +197,7 @@ def store_doc( ...@@ -218,6 +197,7 @@ def store_doc(
"status": True, "status": True,
"collection_name": collection_name, "collection_name": collection_name,
"filename": filename, "filename": filename,
"known_type":known_type,
} }
else: else:
raise HTTPException( raise HTTPException(
......
...@@ -173,7 +173,8 @@ ...@@ -173,7 +173,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);
...@@ -308,8 +309,9 @@ ...@@ -308,8 +309,9 @@
uploadDoc(file); uploadDoc(file);
filesInputElement.value = ''; filesInputElement.value = '';
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
inputFiles = null; uploadDoc(file);
filesInputElement.value = '';
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);
......
...@@ -73,7 +73,8 @@ ...@@ -73,7 +73,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);
...@@ -153,7 +154,8 @@ ...@@ -153,7 +154,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
inputFiles = null; inputFiles = null;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment