Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
open-webui
Commits
3c1ea243
Unverified
Commit
3c1ea243
authored
Jul 01, 2024
by
Timothy Jaeryang Baek
Committed by
GitHub
Jul 01, 2024
Browse files
Merge pull request #3582 from nickovs/tika-document-text
feat: Support Tika for document text extraction
parents
62ba6a24
7aa35a37
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
181 additions
and
41 deletions
+181
-41
.dockerignore
.dockerignore
+2
-1
backend/apps/rag/main.py
backend/apps/rag/main.py
+107
-40
backend/config.py
backend/config.py
+16
-0
src/lib/apis/rag/index.ts
src/lib/apis/rag/index.ts
+6
-0
src/lib/components/admin/Settings/Documents.svelte
src/lib/components/admin/Settings/Documents.svelte
+50
-0
No files found.
.dockerignore
View file @
3c1ea243
...
...
@@ -10,7 +10,8 @@ node_modules
vite.config.js.timestamp-*
vite.config.ts.timestamp-*
__pycache__
.env
.idea
venv
_old
uploads
.ipynb_checkpoints
...
...
backend/apps/rag/main.py
View file @
3c1ea243
...
...
@@ -91,6 +91,8 @@ from config import (
SRC_LOG_LEVELS
,
UPLOAD_DIR
,
DOCS_DIR
,
TEXT_EXTRACTION_ENGINE
,
TIKA_SERVER_URL
,
RAG_TOP_K
,
RAG_RELEVANCE_THRESHOLD
,
RAG_EMBEDDING_ENGINE
,
...
...
@@ -146,6 +148,9 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)
app
.
state
.
config
.
TEXT_EXTRACTION_ENGINE
=
TEXT_EXTRACTION_ENGINE
app
.
state
.
config
.
TIKA_SERVER_URL
=
TIKA_SERVER_URL
app
.
state
.
config
.
CHUNK_SIZE
=
CHUNK_SIZE
app
.
state
.
config
.
CHUNK_OVERLAP
=
CHUNK_OVERLAP
...
...
@@ -390,6 +395,10 @@ async def get_rag_config(user=Depends(get_admin_user)):
return
{
"status"
:
True
,
"pdf_extract_images"
:
app
.
state
.
config
.
PDF_EXTRACT_IMAGES
,
"text_extraction"
:
{
"engine"
:
app
.
state
.
config
.
TEXT_EXTRACTION_ENGINE
,
"tika_server_url"
:
app
.
state
.
config
.
TIKA_SERVER_URL
,
},
"chunk"
:
{
"chunk_size"
:
app
.
state
.
config
.
CHUNK_SIZE
,
"chunk_overlap"
:
app
.
state
.
config
.
CHUNK_OVERLAP
,
...
...
@@ -419,6 +428,11 @@ async def get_rag_config(user=Depends(get_admin_user)):
}
# Text-extraction settings carried in the RAG config update form.
# - engine: engine name, defaults to "". get_loader only routes files to
#   Tika when the configured engine equals "tika" and a server URL is set.
# - tika_server_url: optional Tika server URL, defaults to None.
# update_rag_config copies both fields into app.state.config
# (TEXT_EXTRACTION_ENGINE and TIKA_SERVER_URL) whenever this section is
# present in the submitted form.
class
TextExtractionConfig
(
BaseModel
):
engine
:
str
=
""
tika_server_url
:
Optional
[
str
]
=
None
class
ChunkParamUpdateForm
(
BaseModel
):
chunk_size
:
int
chunk_overlap
:
int
...
...
@@ -452,6 +466,7 @@ class WebConfig(BaseModel):
class
ConfigUpdateForm
(
BaseModel
):
pdf_extract_images
:
Optional
[
bool
]
=
None
text_extraction
:
Optional
[
TextExtractionConfig
]
=
None
chunk
:
Optional
[
ChunkParamUpdateForm
]
=
None
youtube
:
Optional
[
YoutubeLoaderConfig
]
=
None
web
:
Optional
[
WebConfig
]
=
None
...
...
@@ -465,6 +480,11 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
else
app
.
state
.
config
.
PDF_EXTRACT_IMAGES
)
if
form_data
.
text_extraction
is
not
None
:
log
.
info
(
f
"Updating text settings:
{
form_data
.
text_extraction
}
"
)
app
.
state
.
config
.
TEXT_EXTRACTION_ENGINE
=
form_data
.
text_extraction
.
engine
app
.
state
.
config
.
TIKA_SERVER_URL
=
form_data
.
text_extraction
.
tika_server_url
if
form_data
.
chunk
is
not
None
:
app
.
state
.
config
.
CHUNK_SIZE
=
form_data
.
chunk
.
chunk_size
app
.
state
.
config
.
CHUNK_OVERLAP
=
form_data
.
chunk
.
chunk_overlap
...
...
@@ -501,6 +521,10 @@ async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_
return
{
"status"
:
True
,
"pdf_extract_images"
:
app
.
state
.
config
.
PDF_EXTRACT_IMAGES
,
"text_extraction"
:
{
"engine"
:
app
.
state
.
config
.
TEXT_EXTRACTION_ENGINE
,
"tika_server_url"
:
app
.
state
.
config
.
TIKA_SERVER_URL
,
},
"chunk"
:
{
"chunk_size"
:
app
.
state
.
config
.
CHUNK_SIZE
,
"chunk_overlap"
:
app
.
state
.
config
.
CHUNK_OVERLAP
,
...
...
@@ -987,6 +1011,41 @@ def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> b
return
False
class TikaLoader:
    """Load a document's text by sending the file to a Tika server.

    The file at ``file_path`` is uploaded with an HTTP PUT to the
    ``tika/text`` endpoint below ``app.state.config.TIKA_SERVER_URL`` and
    the JSON reply is turned into a single ``Document``.

    Args:
        file_path: Path of the file to upload.
        mime_type: Optional MIME type sent as the request ``Content-Type``.
            When omitted, no ``Content-Type`` header is sent.
        timeout: Optional timeout (seconds, or a ``(connect, read)`` tuple)
            handed to ``requests.put``. ``None`` (the default) keeps the
            previous behaviour of waiting indefinitely.
    """

    def __init__(self, file_path, mime_type=None, *, timeout=None):
        self.file_path = file_path
        self.mime_type = mime_type
        self.timeout = timeout

    def load(self) -> List[Document]:
        """Upload the file and return the extracted text as one Document.

        Returns:
            A one-element list. ``page_content`` is the ``X-TIKA:content``
            value of the reply (or ``"<No text content found>"`` when that
            key is missing). ``metadata`` holds a ``Content-Type`` entry:
            the one reported in the reply if present, otherwise the
            ``mime_type`` given at construction; it is empty when neither
            is available.

        Raises:
            ValueError: If no Tika server URL is configured.
            Exception: If the server answers with a non-OK status.
        """
        endpoint = app.state.config.TIKA_SERVER_URL
        if not endpoint:
            # The settings form allows a null URL; fail with a clear message
            # instead of an AttributeError on ``None.endswith``.
            raise ValueError("Error calling Tika: no Tika server URL configured")

        if not endpoint.endswith("/"):
            endpoint += "/"
        endpoint += "tika/text"

        with open(self.file_path, "rb") as f:
            data = f.read()

        headers = {}
        if self.mime_type is not None:
            headers["Content-Type"] = self.mime_type

        r = requests.put(endpoint, data=data, headers=headers, timeout=self.timeout)
        if not r.ok:
            raise Exception(f"Error calling Tika: {r.reason}")

        raw_metadata = r.json()
        text = raw_metadata.get("X-TIKA:content", "<No text content found>")

        # Keep the request headers and the document metadata as separate
        # dicts; the content type reported in the reply wins when present.
        metadata = dict(headers)
        if "Content-Type" in raw_metadata:
            metadata["Content-Type"] = raw_metadata["Content-Type"]

        # Full document text can be large and sensitive, so it is only
        # emitted at debug level.
        log.debug("Tika extracted text: %s", text)

        return [Document(page_content=text, metadata=metadata)]
def
get_loader
(
filename
:
str
,
file_content_type
:
str
,
file_path
:
str
):
file_ext
=
filename
.
split
(
"."
)[
-
1
].
lower
()
known_type
=
True
...
...
@@ -1037,47 +1096,55 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
"msg"
,
]
if
file_ext
==
"pdf"
:
loader
=
PyPDFLoader
(
file_path
,
extract_images
=
app
.
state
.
config
.
PDF_EXTRACT_IMAGES
)
elif
file_ext
==
"csv"
:
loader
=
CSVLoader
(
file_path
)
elif
file_ext
==
"rst"
:
loader
=
UnstructuredRSTLoader
(
file_path
,
mode
=
"elements"
)
elif
file_ext
==
"xml"
:
loader
=
UnstructuredXMLLoader
(
file_path
)
elif
file_ext
in
[
"htm"
,
"html"
]:
loader
=
BSHTMLLoader
(
file_path
,
open_encoding
=
"unicode_escape"
)
elif
file_ext
==
"md"
:
loader
=
UnstructuredMarkdownLoader
(
file_path
)
elif
file_content_type
==
"application/epub+zip"
:
loader
=
UnstructuredEPubLoader
(
file_path
)
elif
(
file_content_type
==
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or
file_ext
in
[
"doc"
,
"docx"
]
):
loader
=
Docx2txtLoader
(
file_path
)
elif
file_content_type
in
[
"application/vnd.ms-excel"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
]
or
file_ext
in
[
"xls"
,
"xlsx"
]:
loader
=
UnstructuredExcelLoader
(
file_path
)
elif
file_content_type
in
[
"application/vnd.ms-powerpoint"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
]
or
file_ext
in
[
"ppt"
,
"pptx"
]:
loader
=
UnstructuredPowerPointLoader
(
file_path
)
elif
file_ext
==
"msg"
:
loader
=
OutlookMessageLoader
(
file_path
)
elif
file_ext
in
known_source_ext
or
(
file_content_type
and
file_content_type
.
find
(
"text/"
)
>=
0
):
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
if
app
.
state
.
config
.
TEXT_EXTRACTION_ENGINE
==
"tika"
and
app
.
state
.
config
.
TIKA_SERVER_URL
:
if
file_ext
in
known_source_ext
or
(
file_content_type
and
file_content_type
.
find
(
"text/"
)
>=
0
):
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
else
:
loader
=
TikaLoader
(
file_path
,
file_content_type
)
else
:
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
known_type
=
False
if
file_ext
==
"pdf"
:
loader
=
PyPDFLoader
(
file_path
,
extract_images
=
app
.
state
.
config
.
PDF_EXTRACT_IMAGES
)
elif
file_ext
==
"csv"
:
loader
=
CSVLoader
(
file_path
)
elif
file_ext
==
"rst"
:
loader
=
UnstructuredRSTLoader
(
file_path
,
mode
=
"elements"
)
elif
file_ext
==
"xml"
:
loader
=
UnstructuredXMLLoader
(
file_path
)
elif
file_ext
in
[
"htm"
,
"html"
]:
loader
=
BSHTMLLoader
(
file_path
,
open_encoding
=
"unicode_escape"
)
elif
file_ext
==
"md"
:
loader
=
UnstructuredMarkdownLoader
(
file_path
)
elif
file_content_type
==
"application/epub+zip"
:
loader
=
UnstructuredEPubLoader
(
file_path
)
elif
(
file_content_type
==
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or
file_ext
in
[
"doc"
,
"docx"
]
):
loader
=
Docx2txtLoader
(
file_path
)
elif
file_content_type
in
[
"application/vnd.ms-excel"
,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
,
]
or
file_ext
in
[
"xls"
,
"xlsx"
]:
loader
=
UnstructuredExcelLoader
(
file_path
)
elif
file_content_type
in
[
"application/vnd.ms-powerpoint"
,
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
,
]
or
file_ext
in
[
"ppt"
,
"pptx"
]:
loader
=
UnstructuredPowerPointLoader
(
file_path
)
elif
file_ext
==
"msg"
:
loader
=
OutlookMessageLoader
(
file_path
)
elif
file_ext
in
known_source_ext
or
(
file_content_type
and
file_content_type
.
find
(
"text/"
)
>=
0
):
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
else
:
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
known_type
=
False
return
loader
,
known_type
...
...
backend/config.py
View file @
3c1ea243
...
...
@@ -885,6 +885,22 @@ WEBUI_SESSION_COOKIE_SECURE = os.environ.get(
if
WEBUI_AUTH
and
WEBUI_SECRET_KEY
==
""
:
raise
ValueError
(
ERROR_MESSAGES
.
ENV_VAR_NOT_FOUND
)
####################################
# RAG document text extraction
####################################
# Name of the text-extraction engine, read from the TEXT_EXTRACTION_ENGINE
# environment variable and lower-cased; defaults to "" when the variable is
# not set.
# NOTE(review): "rag.text_extraction_engine" is presumably the key under
# which PersistentConfig stores the value -- confirm against PersistentConfig.
TEXT_EXTRACTION_ENGINE
=
PersistentConfig
(
"TEXT_EXTRACTION_ENGINE"
,
"rag.text_extraction_engine"
,
os
.
environ
.
get
(
"TEXT_EXTRACTION_ENGINE"
,
""
).
lower
()
)
# URL of the Tika server, read from the TIKA_SERVER_URL environment variable;
# falls back to "http://tika:9998" when the variable is not set.
TIKA_SERVER_URL
=
PersistentConfig
(
"TIKA_SERVER_URL"
,
"rag.tika_server_url"
,
os
.
getenv
(
"TIKA_SERVER_URL"
,
"http://tika:9998"
),
# Default for sidecar deployment
)
####################################
# RAG
####################################
...
...
src/lib/apis/rag/index.ts
View file @
3c1ea243
...
...
@@ -32,6 +32,11 @@ type ChunkConfigForm = {
chunk_overlap
:
number
;
};
// Shape of the `text_extraction` member of RAGConfigForm.
// `engine` is the selected extraction engine name; `tika_server_url` is the
// Tika server URL and may be null.
type
TextExtractConfigForm
=
{
engine
:
string
;
tika_server_url
:
string
|
null
;
};
type
YoutubeConfigForm
=
{
language
:
string
[];
translation
?:
string
|
null
;
...
...
@@ -40,6 +45,7 @@ type YoutubeConfigForm = {
// Bundle of RAG configuration sections. Every member is optional (`?:`),
// so a value of this type may carry any subset of the sections.
// NOTE(review): presumably the payload type accepted by updateRAGConfig --
// confirm against that function's signature.
type
RAGConfigForm
=
{
pdf_extract_images
?:
boolean
;
chunk
?:
ChunkConfigForm
;
text_extraction
?:
TextExtractConfigForm
;
web_loader_ssl_verification
?:
boolean
;
youtube
?:
YoutubeConfigForm
;
};
...
...
src/lib/components/admin/Settings/Documents.svelte
View file @
3c1ea243
...
...
@@ -37,6 +37,10 @@
let embeddingModel = '';
let rerankingModel = '';
let textExtractionEngine = 'default';
let tikaServerUrl = '';
let showTikaServerUrl = false;
let chunkSize = 0;
let chunkOverlap = 0;
let pdfExtractImages = true;
...
...
@@ -163,11 +167,20 @@
rerankingModelUpdateHandler();
}
if (textExtractionEngine === 'tika' && tikaServerUrl === '') {
toast.error($i18n.t('Tika Server URL required.'));
return;
}
const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
chunk: {
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
},
text_extraction: {
engine: textExtractionEngine,
tika_server_url: tikaServerUrl
}
});
...
...
@@ -213,6 +226,10 @@
chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;
textExtractionEngine = res.text_extraction.engine;
tikaServerUrl = res.text_extraction.tika_server_url;
showTikaServerUrl = textExtractionEngine === 'tika';
}
});
</script>
...
...
@@ -388,6 +405,39 @@
</div>
</div>
<hr class="dark:border-gray-850" />
<div class="">
<div class="text-sm font-medium">{$i18n.t('Text Extraction')}</div>
<div class="flex w-full justify-between mt-2">
<div class="self-center text-xs font-medium">{$i18n.t('Engine')}</div>
<div class="flex items-center relative">
<select
class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={textExtractionEngine}
on:change={(e) => {
showTikaServerUrl = (e.target.value === 'tika');
}}
>
<option value="default">{$i18n.t('Default')}</option>
<option value="tika">{$i18n.t('Tika')}</option>
</select>
</div>
</div>
{#if showTikaServerUrl}
<div class="flex w-full mt-2">
<div class="flex-1 mr-2">
<input
class="w-full rounded-lg py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
placeholder={$i18n.t('Enter Tika Server URL')}
bind:value={tikaServerUrl}
/>
</div>
</div>
{/if}
</div>
<hr class=" dark:border-gray-850 my-1" />
<div class="space-y-2" />
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment