Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
open-webui
Commits
a1fc2f4d
Unverified
Commit
a1fc2f4d
authored
Mar 25, 2024
by
Timothy Jaeryang Baek
Committed by
GitHub
Mar 25, 2024
Browse files
Merge pull request #1292 from ddanat-smm/dev
Add htm/html support for RAG documents
parents
4f78acaa
3688955c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
64 additions
and
43 deletions
+64
-43
backend/apps/rag/main.py
backend/apps/rag/main.py
+59
-43
backend/constants.py
backend/constants.py
+2
-0
src/lib/constants.ts
src/lib/constants.ts
+3
-0
No files found.
backend/apps/rag/main.py
View file @
a1fc2f4d
...
...
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
TextLoader
,
PyPDFLoader
,
CSVLoader
,
BSHTMLLoader
,
Docx2txtLoader
,
UnstructuredEPubLoader
,
UnstructuredWordDocumentLoader
,
...
...
@@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel):
class
StoreWebForm
(
CollectionNameForm
):
url
:
str
@
app
.
get
(
"/"
)
async
def
get_status
():
return
{
...
...
@@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
def
store_data_in_vector_db
(
data
,
collection_name
,
overwrite
:
bool
=
False
)
->
bool
:
text_splitter
=
RecursiveCharacterTextSplitter
(
chunk_size
=
app
.
state
.
CHUNK_SIZE
,
chunk_overlap
=
app
.
state
.
CHUNK_OVERLAP
,
add_start_index
=
True
,
)
docs
=
text_splitter
.
split_documents
(
data
)
return
store_docs_in_vector_db
(
docs
,
collection_name
,
overwrite
)
if
len
(
docs
)
>
0
:
return
store_docs_in_vector_db
(
docs
,
collection_name
,
overwrite
),
None
else
:
raise
ValueError
(
ERROR_MESSAGES
.
EMPTY_CONTENT
)
def
store_text_in_vector_db
(
...
...
@@ -318,6 +325,7 @@ def store_text_in_vector_db(
def
store_docs_in_vector_db
(
docs
,
collection_name
,
overwrite
:
bool
=
False
)
->
bool
:
texts
=
[
doc
.
page_content
for
doc
in
docs
]
metadatas
=
[
doc
.
metadata
for
doc
in
docs
]
...
...
@@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
loader
=
UnstructuredRSTLoader
(
file_path
,
mode
=
"elements"
)
elif
file_ext
==
"xml"
:
loader
=
UnstructuredXMLLoader
(
file_path
)
elif
file_ext
in
[
"htm"
,
"html"
]:
loader
=
BSHTMLLoader
(
file_path
,
open_encoding
=
"unicode_escape"
)
elif
file_ext
==
"md"
:
loader
=
UnstructuredMarkdownLoader
(
file_path
)
elif
file_content_type
==
"application/epub+zip"
:
...
...
@@ -452,6 +462,8 @@ def store_doc(
loader
,
known_type
=
get_loader
(
file
.
filename
,
file
.
content_type
,
file_path
)
data
=
loader
.
load
()
try
:
result
=
store_data_in_vector_db
(
data
,
collection_name
)
if
result
:
...
...
@@ -461,10 +473,10 @@ def store_doc(
"filename"
:
filename
,
"known_type"
:
known_type
,
}
e
ls
e
:
e
xcept
Exception
as
e
:
raise
HTTPException
(
status_code
=
status
.
HTTP_500_INTERNAL_SERVER_ERROR
,
detail
=
ERROR_MESSAGES
.
DEFAULT
()
,
detail
=
e
,
)
except
Exception
as
e
:
log
.
exception
(
e
)
...
...
@@ -529,6 +541,7 @@ def scan_docs_dir(user=Depends(get_admin_user)):
)
data
=
loader
.
load
()
try
:
result
=
store_data_in_vector_db
(
data
,
collection_name
)
if
result
:
...
...
@@ -561,6 +574,9 @@ def scan_docs_dir(user=Depends(get_admin_user)):
}
),
)
except
Exception
as
e
:
print
(
e
)
pass
except
Exception
as
e
:
log
.
exception
(
e
)
...
...
backend/constants.py
View file @
a1fc2f4d
...
...
@@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum):
MODEL_NOT_FOUND
=
lambda
name
=
""
:
f
"Model '
{
name
}
' was not found"
OPENAI_NOT_FOUND
=
lambda
name
=
""
:
f
"OpenAI API was not found"
OLLAMA_NOT_FOUND
=
"WebUI could not connect to Ollama"
EMPTY_CONTENT
=
"The content provided is empty. Please ensure that there is text or data present before proceeding."
src/lib/constants.ts
View file @
a1fc2f4d
...
...
@@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [
'
text/plain
'
,
'
text/csv
'
,
'
text/xml
'
,
'
text/html
'
,
'
text/x-python
'
,
'
text/css
'
,
'
application/vnd.openxmlformats-officedocument.wordprocessingml.document
'
,
...
...
@@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
'
h
'
,
'
c
'
,
'
cs
'
,
'
htm
'
,
'
html
'
,
'
sql
'
,
'
log
'
,
'
ini
'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment