main.py 48.1 KB
Newer Older
Timothy J. Baek's avatar
Timothy J. Baek committed
1
2
3
4
5
6
7
8
9
from fastapi import (
    FastAPI,
    Depends,
    HTTPException,
    status,
    UploadFile,
    File,
    Form,
)
Timothy J. Baek's avatar
Timothy J. Baek committed
10
from fastapi.middleware.cors import CORSMiddleware
Que Nguyen's avatar
Que Nguyen committed
11
import requests
12
import os, shutil, logging, re
mindspawn's avatar
mindspawn committed
13
from datetime import datetime
14
15

from pathlib import Path
Michael Poluektov's avatar
Michael Poluektov committed
16
from typing import Union, Sequence, Iterator, Any
Timothy J. Baek's avatar
Timothy J. Baek committed
17

18
from chromadb.utils.batch_utils import create_batches
19
from langchain_core.documents import Document
Timothy J. Baek's avatar
Timothy J. Baek committed
20

Timothy J. Baek's avatar
Timothy J. Baek committed
21
22
23
24
25
from langchain_community.document_loaders import (
    WebBaseLoader,
    TextLoader,
    PyPDFLoader,
    CSVLoader,
26
    BSHTMLLoader,
Timothy J. Baek's avatar
Timothy J. Baek committed
27
    Docx2txtLoader,
Dave Bauman's avatar
Dave Bauman committed
28
    UnstructuredEPubLoader,
Timothy J. Baek's avatar
Timothy J. Baek committed
29
30
    UnstructuredWordDocumentLoader,
    UnstructuredMarkdownLoader,
31
    UnstructuredXMLLoader,
Marclass's avatar
Marclass committed
32
    UnstructuredRSTLoader,
Marclass's avatar
Marclass committed
33
    UnstructuredExcelLoader,
Timothy J. Baek's avatar
Timothy J. Baek committed
34
    UnstructuredPowerPointLoader,
Timothy J. Baek's avatar
Timothy J. Baek committed
35
    YoutubeLoader,
mindspawn's avatar
mindspawn committed
36
    OutlookMessageLoader,
Timothy J. Baek's avatar
Timothy J. Baek committed
37
)
38
39
from langchain.text_splitter import RecursiveCharacterTextSplitter

40
41
42
43
44
import validators
import urllib.parse
import socket


45
46
from pydantic import BaseModel
from typing import Optional
47
import mimetypes
48
import uuid
49
50
import json

Timothy J. Baek's avatar
fix  
Timothy J. Baek committed
51
from apps.webui.models.documents import (
52
53
54
55
    Documents,
    DocumentForm,
    DocumentResponse,
)
Timothy J. Baek's avatar
Timothy J. Baek committed
56
57
58
from apps.webui.models.files import (
    Files,
)
Jannik Streidl's avatar
Jannik Streidl committed
59

60
from apps.rag.utils import (
61
    get_model_path,
Timothy J. Baek's avatar
Timothy J. Baek committed
62
63
64
65
66
    get_embedding_function,
    query_doc,
    query_doc_with_hybrid_search,
    query_collection,
    query_collection_with_hybrid_search,
67
)
Timothy J. Baek's avatar
Timothy J. Baek committed
68

Timothy J. Baek's avatar
Timothy J. Baek committed
69
70
71
72
73
74
from apps.rag.search.brave import search_brave
from apps.rag.search.google_pse import search_google_pse
from apps.rag.search.main import SearchResult
from apps.rag.search.searxng import search_searxng
from apps.rag.search.serper import search_serper
from apps.rag.search.serpstack import search_serpstack
75
from apps.rag.search.serply import search_serply
76
from apps.rag.search.duckduckgo import search_duckduckgo
77
from apps.rag.search.tavily import search_tavily
78
from apps.rag.search.jina_search import search_jina
Timothy J. Baek's avatar
Timothy J. Baek committed
79

80
81
82
83
84
85
from utils.misc import (
    calculate_sha256,
    calculate_sha256_string,
    sanitize_filename,
    extract_folders_after_data_docs,
)
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
86
from utils.utils import get_verified_user, get_admin_user
87

88
from config import (
89
    AppConfig,
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
90
    ENV,
91
    SRC_LOG_LEVELS,
92
93
    UPLOAD_DIR,
    DOCS_DIR,
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
94
    CONTENT_EXTRACTION_ENGINE,
95
    TIKA_SERVER_URL,
96
97
    RAG_TOP_K,
    RAG_RELEVANCE_THRESHOLD,
98
    RAG_EMBEDDING_ENGINE,
99
    RAG_EMBEDDING_MODEL,
100
    RAG_EMBEDDING_MODEL_AUTO_UPDATE,
101
    RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
Timothy J. Baek's avatar
Timothy J. Baek committed
102
    ENABLE_RAG_HYBRID_SEARCH,
103
    ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
Steven Kreitzer's avatar
Steven Kreitzer committed
104
    RAG_RERANKING_MODEL,
105
    PDF_EXTRACT_IMAGES,
106
    RAG_RERANKING_MODEL_AUTO_UPDATE,
Steven Kreitzer's avatar
Steven Kreitzer committed
107
    RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
Timothy J. Baek's avatar
Timothy J. Baek committed
108
109
    RAG_OPENAI_API_BASE_URL,
    RAG_OPENAI_API_KEY,
110
    DEVICE_TYPE,
111
112
113
    CHROMA_CLIENT,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
Timothy J. Baek's avatar
Timothy J. Baek committed
114
    RAG_TEMPLATE,
115
    ENABLE_RAG_LOCAL_WEB_FETCH,
116
    YOUTUBE_LOADER_LANGUAGE,
Timothy J. Baek's avatar
Timothy J. Baek committed
117
    ENABLE_RAG_WEB_SEARCH,
Timothy J. Baek's avatar
Timothy J. Baek committed
118
    RAG_WEB_SEARCH_ENGINE,
Que Nguyen's avatar
Que Nguyen committed
119
    RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
Timothy J. Baek's avatar
Timothy J. Baek committed
120
121
122
    SEARXNG_QUERY_URL,
    GOOGLE_PSE_API_KEY,
    GOOGLE_PSE_ENGINE_ID,
Timothy J. Baek's avatar
Timothy J. Baek committed
123
    BRAVE_SEARCH_API_KEY,
Timothy J. Baek's avatar
Timothy J. Baek committed
124
125
126
    SERPSTACK_API_KEY,
    SERPSTACK_HTTPS,
    SERPER_API_KEY,
127
    SERPLY_API_KEY,
128
    TAVILY_API_KEY,
Timothy J. Baek's avatar
Timothy J. Baek committed
129
    RAG_WEB_SEARCH_RESULT_COUNT,
130
    RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
131
    RAG_EMBEDDING_OPENAI_BATCH_SIZE,
132
)
133

134
135
from constants import ERROR_MESSAGES

136
137
138
# Module logger; its level comes from the "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])

app = FastAPI()

# Runtime-mutable settings live on app.state.config; every value below is
# seeded from the corresponding name imported from `config`.
app.state.config = AppConfig()

# --- retrieval ---------------------------------------------------------------
app.state.config.TOP_K = RAG_TOP_K
app.state.config.RELEVANCE_THRESHOLD = RAG_RELEVANCE_THRESHOLD

app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH
app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
    ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION
)

# --- content extraction ------------------------------------------------------
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL

# --- chunking ----------------------------------------------------------------
app.state.config.CHUNK_SIZE = CHUNK_SIZE
app.state.config.CHUNK_OVERLAP = CHUNK_OVERLAP

# --- embedding / reranking ---------------------------------------------------
app.state.config.RAG_EMBEDDING_ENGINE = RAG_EMBEDDING_ENGINE
app.state.config.RAG_EMBEDDING_MODEL = RAG_EMBEDDING_MODEL
app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE = RAG_EMBEDDING_OPENAI_BATCH_SIZE
app.state.config.RAG_RERANKING_MODEL = RAG_RERANKING_MODEL
app.state.config.RAG_TEMPLATE = RAG_TEMPLATE


app.state.config.OPENAI_API_BASE_URL = RAG_OPENAI_API_BASE_URL
app.state.config.OPENAI_API_KEY = RAG_OPENAI_API_KEY

app.state.config.PDF_EXTRACT_IMAGES = PDF_EXTRACT_IMAGES


# --- YouTube loader ----------------------------------------------------------
app.state.config.YOUTUBE_LOADER_LANGUAGE = YOUTUBE_LOADER_LANGUAGE
# Note: the translation setting is stored directly on app.state, not on
# app.state.config like the other settings.
app.state.YOUTUBE_LOADER_TRANSLATION = None


# --- web search --------------------------------------------------------------
app.state.config.ENABLE_RAG_WEB_SEARCH = ENABLE_RAG_WEB_SEARCH
app.state.config.RAG_WEB_SEARCH_ENGINE = RAG_WEB_SEARCH_ENGINE
app.state.config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST = RAG_WEB_SEARCH_DOMAIN_FILTER_LIST

app.state.config.SEARXNG_QUERY_URL = SEARXNG_QUERY_URL
app.state.config.GOOGLE_PSE_API_KEY = GOOGLE_PSE_API_KEY
app.state.config.GOOGLE_PSE_ENGINE_ID = GOOGLE_PSE_ENGINE_ID
app.state.config.BRAVE_SEARCH_API_KEY = BRAVE_SEARCH_API_KEY
app.state.config.SERPSTACK_API_KEY = SERPSTACK_API_KEY
app.state.config.SERPSTACK_HTTPS = SERPSTACK_HTTPS
app.state.config.SERPER_API_KEY = SERPER_API_KEY
app.state.config.SERPLY_API_KEY = SERPLY_API_KEY
app.state.config.TAVILY_API_KEY = TAVILY_API_KEY
app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = RAG_WEB_SEARCH_RESULT_COUNT
app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = RAG_WEB_SEARCH_CONCURRENT_REQUESTS


191
192
193
194
def update_embedding_model(
    embedding_model: str,
    update_model: bool = False,
):
    """Load or clear the local sentence-transformers embedding model.

    A ``SentenceTransformer`` is created and stored on
    ``app.state.sentence_transformer_ef`` only when ``embedding_model`` is
    non-empty and the configured ``RAG_EMBEDDING_ENGINE`` is the empty
    string. In every other case the attribute is reset to ``None``.

    Args:
        embedding_model: Model identifier, resolved via ``get_model_path``.
        update_model: Forwarded unchanged to ``get_model_path``.
    """
    if embedding_model and app.state.config.RAG_EMBEDDING_ENGINE == "":
        # Imported inside the branch so the package is only loaded when a
        # local model is actually requested.
        import sentence_transformers

        app.state.sentence_transformer_ef = sentence_transformers.SentenceTransformer(
            get_model_path(embedding_model, update_model),
            device=DEVICE_TYPE,
            trust_remote_code=RAG_EMBEDDING_MODEL_TRUST_REMOTE_CODE,
        )
    else:
        app.state.sentence_transformer_ef = None


def update_reranking_model(
    reranking_model: str,
    update_model: bool = False,
):
    """Load or clear the cross-encoder used for reranking.

    When ``reranking_model`` is non-empty a ``CrossEncoder`` is created and
    stored on ``app.state.sentence_transformer_rf``; otherwise the attribute
    is reset to ``None``.

    Args:
        reranking_model: Model identifier, resolved via ``get_model_path``.
        update_model: Forwarded unchanged to ``get_model_path``.
    """
    if reranking_model:
        # Imported inside the branch so the package is only loaded when a
        # reranking model is actually requested.
        import sentence_transformers

        app.state.sentence_transformer_rf = sentence_transformers.CrossEncoder(
            get_model_path(reranking_model, update_model),
            device=DEVICE_TYPE,
            trust_remote_code=RAG_RERANKING_MODEL_TRUST_REMOTE_CODE,
        )
    else:
        app.state.sentence_transformer_rf = None


# Load the configured models once at import time.
update_embedding_model(
    app.state.config.RAG_EMBEDDING_MODEL,
    RAG_EMBEDDING_MODEL_AUTO_UPDATE,
)

update_reranking_model(
    app.state.config.RAG_RERANKING_MODEL,
    RAG_RERANKING_MODEL_AUTO_UPDATE,
)


# Build the embedding callable from the current engine/model settings; it is
# rebuilt by the /embedding/update handler when those settings change.
app.state.EMBEDDING_FUNCTION = get_embedding_function(
    app.state.config.RAG_EMBEDDING_ENGINE,
    app.state.config.RAG_EMBEDDING_MODEL,
    app.state.sentence_transformer_ef,
    app.state.config.OPENAI_API_KEY,
    app.state.config.OPENAI_API_BASE_URL,
    app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
)

Timothy J. Baek's avatar
Timothy J. Baek committed
243
244
# NOTE(review): every origin is allowed while credentials are enabled —
# confirm this is intended for the deployment this sub-app is mounted in.
origins = ["*"]


app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


Timothy J. Baek's avatar
Timothy J. Baek committed
255
# Base request body for endpoints that operate on a named collection.
class CollectionNameForm(BaseModel):
    # Collection to use; defaults to "test" when the client omits it.
    collection_name: Optional[str] = "test"


Timothy J. Baek's avatar
Timothy J. Baek committed
259
# Request body carrying a URL plus the inherited collection name.
class UrlForm(CollectionNameForm):
    # Required URL string.
    url: str

Timothy J. Baek's avatar
Timothy J. Baek committed
262

263
264
265
266
# Request body carrying a search query plus the inherited collection name.
class SearchForm(CollectionNameForm):
    query: str  # required query string


Timothy J. Baek's avatar
Timothy J. Baek committed
267
268
@app.get("/")
async def get_status():
    # Reports the current chunking, template, and model settings.
    # No user dependency is declared on this route.
    return {
        "status": True,
        "chunk_size": app.state.config.CHUNK_SIZE,
        "chunk_overlap": app.state.config.CHUNK_OVERLAP,
        "template": app.state.config.RAG_TEMPLATE,
        "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE,
        "embedding_model": app.state.config.RAG_EMBEDDING_MODEL,
        "reranking_model": app.state.config.RAG_RERANKING_MODEL,
        "openai_batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
    }


Timothy J. Baek's avatar
Timothy J. Baek committed
281
282
@app.get("/embedding")
async def get_embedding_config(user=Depends(get_admin_user)):
    # Admin-only: returns the embedding engine/model and the OpenAI-style
    # connection settings. The response includes the stored API key.
    return {
        "status": True,
        "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE,
        "embedding_model": app.state.config.RAG_EMBEDDING_MODEL,
        "openai_config": {
            "url": app.state.config.OPENAI_API_BASE_URL,
            "key": app.state.config.OPENAI_API_KEY,
            "batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
        },
    }


Steven Kreitzer's avatar
Steven Kreitzer committed
295
296
@app.get("/reranking")
async def get_reraanking_config(user=Depends(get_admin_user)):
    # Admin-only: returns the configured reranking model.
    # The function name is misspelled ("reraanking"); it is kept as-is
    # because renaming it would change the handler's public identifier.
    return {
        "status": True,
        "reranking_model": app.state.config.RAG_RERANKING_MODEL,
    }
Steven Kreitzer's avatar
Steven Kreitzer committed
301
302


303
304
305
# Connection settings for an OpenAI-style embedding endpoint.
class OpenAIConfigForm(BaseModel):
    url: str  # API base URL
    key: str  # API key
    # Optional batch size; None when the client omits it.
    batch_size: Optional[int] = None
307
308


309
# Request body for POST /embedding/update.
class EmbeddingModelUpdateForm(BaseModel):
    # Optional connection settings; only read by the update handler when the
    # engine is "ollama" or "openai".
    openai_config: Optional[OpenAIConfigForm] = None
    embedding_engine: str
    embedding_model: str


Timothy J. Baek's avatar
Timothy J. Baek committed
315
316
@app.post("/embedding/update")
async def update_embedding_config(
    form_data: EmbeddingModelUpdateForm, user=Depends(get_admin_user)
):
    # Admin-only: switches the embedding engine/model, reloads the local
    # model, and rebuilds app.state.EMBEDDING_FUNCTION. Any exception is
    # logged and surfaced as HTTP 500.
    log.info(
        f"Updating embedding model: {app.state.config.RAG_EMBEDDING_MODEL} to {form_data.embedding_model}"
    )
    try:
        # NOTE(review): the config is mutated before the model is loaded; if
        # loading raises, the new values stay in place — confirm acceptable.
        app.state.config.RAG_EMBEDDING_ENGINE = form_data.embedding_engine
        app.state.config.RAG_EMBEDDING_MODEL = form_data.embedding_model

        if app.state.config.RAG_EMBEDDING_ENGINE in ["ollama", "openai"]:
            if form_data.openai_config is not None:
                app.state.config.OPENAI_API_BASE_URL = form_data.openai_config.url
                app.state.config.OPENAI_API_KEY = form_data.openai_config.key
                # A missing or zero batch size falls back to 1.
                app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE = (
                    form_data.openai_config.batch_size
                    if form_data.openai_config.batch_size
                    else 1
                )

        update_embedding_model(app.state.config.RAG_EMBEDDING_MODEL)

        app.state.EMBEDDING_FUNCTION = get_embedding_function(
            app.state.config.RAG_EMBEDDING_ENGINE,
            app.state.config.RAG_EMBEDDING_MODEL,
            app.state.sentence_transformer_ef,
            app.state.config.OPENAI_API_KEY,
            app.state.config.OPENAI_API_BASE_URL,
            app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
        )

        return {
            "status": True,
            "embedding_engine": app.state.config.RAG_EMBEDDING_ENGINE,
            "embedding_model": app.state.config.RAG_EMBEDDING_MODEL,
            "openai_config": {
                "url": app.state.config.OPENAI_API_BASE_URL,
                "key": app.state.config.OPENAI_API_KEY,
                "batch_size": app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
            },
        }
    except Exception as e:
        log.exception(f"Problem updating embedding model: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
Timothy J. Baek's avatar
Timothy J. Baek committed
363
364


Steven Kreitzer's avatar
Steven Kreitzer committed
365
366
# Request body for POST /reranking/update.
class RerankingModelUpdateForm(BaseModel):
    reranking_model: str  # required; the update handler stores it as RAG_RERANKING_MODEL
367

Steven Kreitzer's avatar
Steven Kreitzer committed
368
369
370
371
372
373

@app.post("/reranking/update")
async def update_reranking_config(
    form_data: RerankingModelUpdateForm, user=Depends(get_admin_user)
):
    # Admin-only: stores the new reranking model name and reloads the
    # cross-encoder. Any exception is logged and surfaced as HTTP 500.
    log.info(
        f"Updating reranking model: {app.state.config.RAG_RERANKING_MODEL} to {form_data.reranking_model}"
    )
    try:
        app.state.config.RAG_RERANKING_MODEL = form_data.reranking_model

        # update_model=True is passed explicitly here.
        update_reranking_model(app.state.config.RAG_RERANKING_MODEL, True)

        return {
            "status": True,
            "reranking_model": app.state.config.RAG_RERANKING_MODEL,
        }
    except Exception as e:
        log.exception(f"Problem updating reranking model: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )


Timothy J. Baek's avatar
Timothy J. Baek committed
393
394
@app.get("/config")
async def get_rag_config(user=Depends(get_admin_user)):
    # Admin-only: returns the extraction, chunking, YouTube, and web-search
    # settings. The response includes the stored search-provider API keys.
    return {
        "status": True,
        "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
        "content_extraction": {
            "engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": app.state.config.TIKA_SERVER_URL,
        },
        "chunk": {
            "chunk_size": app.state.config.CHUNK_SIZE,
            "chunk_overlap": app.state.config.CHUNK_OVERLAP,
        },
        "youtube": {
            "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
            # Read from app.state directly, not app.state.config.
            "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
        },
        "web": {
            "ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
            "search": {
                "enabled": app.state.config.ENABLE_RAG_WEB_SEARCH,
                "engine": app.state.config.RAG_WEB_SEARCH_ENGINE,
                "searxng_query_url": app.state.config.SEARXNG_QUERY_URL,
                "google_pse_api_key": app.state.config.GOOGLE_PSE_API_KEY,
                "google_pse_engine_id": app.state.config.GOOGLE_PSE_ENGINE_ID,
                "brave_search_api_key": app.state.config.BRAVE_SEARCH_API_KEY,
                "serpstack_api_key": app.state.config.SERPSTACK_API_KEY,
                "serpstack_https": app.state.config.SERPSTACK_HTTPS,
                "serper_api_key": app.state.config.SERPER_API_KEY,
                "serply_api_key": app.state.config.SERPLY_API_KEY,
                "tavily_api_key": app.state.config.TAVILY_API_KEY,
                "result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
                "concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
            },
        },
    }


Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
431
# Content-extraction section of the config update request.
class ContentExtractionConfig(BaseModel):
    # Extraction engine name; defaults to the empty string.
    engine: str = ""
    # Optional Tika server URL; None when omitted.
    tika_server_url: Optional[str] = None


Timothy J. Baek's avatar
Timothy J. Baek committed
436
437
438
439
440
# Chunking section of the config update request; both fields are required.
class ChunkParamUpdateForm(BaseModel):
    chunk_size: int  # stored as CHUNK_SIZE by the config update handler
    chunk_overlap: int  # stored as CHUNK_OVERLAP by the config update handler


441
# YouTube-loader section of the config update request.
class YoutubeLoaderConfig(BaseModel):
    # Required list of language strings.
    language: list[str]
    # Optional translation setting; None when omitted.
    translation: Optional[str] = None


Timothy J. Baek's avatar
Timothy J. Baek committed
446
# Web-search section of the config update request. Only `enabled` is
# required; every other field defaults to None when omitted.
class WebSearchConfig(BaseModel):
    enabled: bool
    engine: Optional[str] = None
    searxng_query_url: Optional[str] = None
    google_pse_api_key: Optional[str] = None
    google_pse_engine_id: Optional[str] = None
    brave_search_api_key: Optional[str] = None
    serpstack_api_key: Optional[str] = None
    serpstack_https: Optional[bool] = None
    serper_api_key: Optional[str] = None
    serply_api_key: Optional[str] = None
    tavily_api_key: Optional[str] = None
    result_count: Optional[int] = None
    concurrent_requests: Optional[int] = None


Timothy J. Baek's avatar
Timothy J. Baek committed
462
463
464
465
466
# Web section of the config update request.
class WebConfig(BaseModel):
    search: WebSearchConfig  # required nested search settings
    # Optional; None when omitted. Note the GET /config response reports this
    # setting under the key "ssl_verification", not this field name.
    web_loader_ssl_verification: Optional[bool] = None


Timothy J. Baek's avatar
Timothy J. Baek committed
467
# Request body for POST /config/update. Every section is optional and
# defaults to None when omitted.
class ConfigUpdateForm(BaseModel):
    pdf_extract_images: Optional[bool] = None
    content_extraction: Optional[ContentExtractionConfig] = None
    chunk: Optional[ChunkParamUpdateForm] = None
    youtube: Optional[YoutubeLoaderConfig] = None
    web: Optional[WebConfig] = None
Timothy J. Baek's avatar
Timothy J. Baek committed
473
474
475
476


@app.post("/config/update")
async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
    # Admin-only: applies whichever sections of the form are present and
    # returns the resulting configuration. Sections that are None are left
    # untouched.
    app.state.config.PDF_EXTRACT_IMAGES = (
        form_data.pdf_extract_images
        if form_data.pdf_extract_images is not None
        else app.state.config.PDF_EXTRACT_IMAGES
    )

    if form_data.content_extraction is not None:
        log.info(f"Updating text settings: {form_data.content_extraction}")
        app.state.config.CONTENT_EXTRACTION_ENGINE = form_data.content_extraction.engine
        app.state.config.TIKA_SERVER_URL = form_data.content_extraction.tika_server_url

    if form_data.chunk is not None:
        app.state.config.CHUNK_SIZE = form_data.chunk.chunk_size
        app.state.config.CHUNK_OVERLAP = form_data.chunk.chunk_overlap

    if form_data.youtube is not None:
        app.state.config.YOUTUBE_LOADER_LANGUAGE = form_data.youtube.language
        # Stored on app.state directly, not app.state.config.
        app.state.YOUTUBE_LOADER_TRANSLATION = form_data.youtube.translation

    if form_data.web is not None:
        # The field is Optional and defaults to None. Only overwrite the
        # boolean setting when a value was actually supplied, so an omitted
        # field cannot replace it with None (same rule as pdf_extract_images).
        if form_data.web.web_loader_ssl_verification is not None:
            app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
                form_data.web.web_loader_ssl_verification
            )

        # The search settings below are copied verbatim; fields the client
        # omits arrive as None and are stored as None.
        app.state.config.ENABLE_RAG_WEB_SEARCH = form_data.web.search.enabled
        app.state.config.RAG_WEB_SEARCH_ENGINE = form_data.web.search.engine
        app.state.config.SEARXNG_QUERY_URL = form_data.web.search.searxng_query_url
        app.state.config.GOOGLE_PSE_API_KEY = form_data.web.search.google_pse_api_key
        app.state.config.GOOGLE_PSE_ENGINE_ID = (
            form_data.web.search.google_pse_engine_id
        )
        app.state.config.BRAVE_SEARCH_API_KEY = (
            form_data.web.search.brave_search_api_key
        )
        app.state.config.SERPSTACK_API_KEY = form_data.web.search.serpstack_api_key
        app.state.config.SERPSTACK_HTTPS = form_data.web.search.serpstack_https
        app.state.config.SERPER_API_KEY = form_data.web.search.serper_api_key
        app.state.config.SERPLY_API_KEY = form_data.web.search.serply_api_key
        app.state.config.TAVILY_API_KEY = form_data.web.search.tavily_api_key
        app.state.config.RAG_WEB_SEARCH_RESULT_COUNT = form_data.web.search.result_count
        app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS = (
            form_data.web.search.concurrent_requests
        )

    return {
        "status": True,
        "pdf_extract_images": app.state.config.PDF_EXTRACT_IMAGES,
        "content_extraction": {
            "engine": app.state.config.CONTENT_EXTRACTION_ENGINE,
            "tika_server_url": app.state.config.TIKA_SERVER_URL,
        },
        "chunk": {
            "chunk_size": app.state.config.CHUNK_SIZE,
            "chunk_overlap": app.state.config.CHUNK_OVERLAP,
        },
        "youtube": {
            "language": app.state.config.YOUTUBE_LOADER_LANGUAGE,
            "translation": app.state.YOUTUBE_LOADER_TRANSLATION,
        },
        "web": {
            "ssl_verification": app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
            "search": {
                "enabled": app.state.config.ENABLE_RAG_WEB_SEARCH,
                "engine": app.state.config.RAG_WEB_SEARCH_ENGINE,
                "searxng_query_url": app.state.config.SEARXNG_QUERY_URL,
                "google_pse_api_key": app.state.config.GOOGLE_PSE_API_KEY,
                "google_pse_engine_id": app.state.config.GOOGLE_PSE_ENGINE_ID,
                "brave_search_api_key": app.state.config.BRAVE_SEARCH_API_KEY,
                "serpstack_api_key": app.state.config.SERPSTACK_API_KEY,
                "serpstack_https": app.state.config.SERPSTACK_HTTPS,
                "serper_api_key": app.state.config.SERPER_API_KEY,
                "serply_api_key": app.state.config.SERPLY_API_KEY,
                "tavily_api_key": app.state.config.TAVILY_API_KEY,
                "result_count": app.state.config.RAG_WEB_SEARCH_RESULT_COUNT,
                "concurrent_requests": app.state.config.RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
            },
        },
    }
555
556


Timothy J. Baek's avatar
Timothy J. Baek committed
557
@app.get("/template")
async def get_rag_template(user=Depends(get_verified_user)):
    # Returns the current RAG prompt template; requires a verified user.
    return {
        "status": True,
        "template": app.state.config.RAG_TEMPLATE,
    }


565
566
567
568
@app.get("/query/settings")
async def get_query_settings(user=Depends(get_admin_user)):
    # Admin-only: returns the template, top-k ("k"), relevance threshold
    # ("r"), and the hybrid-search flag.
    return {
        "status": True,
        "template": app.state.config.RAG_TEMPLATE,
        "k": app.state.config.TOP_K,
        "r": app.state.config.RELEVANCE_THRESHOLD,
        "hybrid": app.state.config.ENABLE_RAG_HYBRID_SEARCH,
    }
Timothy J. Baek's avatar
Timothy J. Baek committed
574
575


576
577
# Request body for POST /query/settings/update. All fields are optional and
# default to None when omitted.
class QuerySettingsForm(BaseModel):
    k: Optional[int] = None  # stored as TOP_K
    r: Optional[float] = None  # stored as RELEVANCE_THRESHOLD
    template: Optional[str] = None  # stored as RAG_TEMPLATE
    hybrid: Optional[bool] = None  # stored as ENABLE_RAG_HYBRID_SEARCH
581
582
583
584
585
586


@app.post("/query/settings/update")
async def update_query_settings(
    form_data: QuerySettingsForm, user=Depends(get_admin_user)
):
    # Admin-only: overwrites all four query settings and returns them.
    #
    # Each field is tested for truthiness, so a missing or falsy value does
    # NOT keep the current setting; it resets it to a fallback:
    #   template -> RAG_TEMPLATE (imported from config)
    #   k        -> 4
    #   r        -> 0.0
    #   hybrid   -> False
    app.state.config.RAG_TEMPLATE = (
        form_data.template if form_data.template else RAG_TEMPLATE
    )
    app.state.config.TOP_K = form_data.k if form_data.k else 4
    app.state.config.RELEVANCE_THRESHOLD = form_data.r if form_data.r else 0.0
    app.state.config.ENABLE_RAG_HYBRID_SEARCH = (
        form_data.hybrid if form_data.hybrid else False
    )
    return {
        "status": True,
        "template": app.state.config.RAG_TEMPLATE,
        "k": app.state.config.TOP_K,
        "r": app.state.config.RELEVANCE_THRESHOLD,
        "hybrid": app.state.config.ENABLE_RAG_HYBRID_SEARCH,
    }
602
603


604
# Request body for POST /query/doc.
class QueryDocForm(BaseModel):
    collection_name: str
    query: str
    k: Optional[int] = None  # per-request top-k override
    r: Optional[float] = None  # per-request relevance-threshold override
    # NOTE(review): the /query/doc handler does not read this field; the
    # global ENABLE_RAG_HYBRID_SEARCH flag decides — confirm intended.
    hybrid: Optional[bool] = None
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
610
611


612
@app.post("/query/doc")
def query_doc_handler(
    form_data: QueryDocForm,
    user=Depends(get_verified_user),
):
    # Queries a single collection; requires a verified user. Hybrid search
    # (with reranking and a relevance threshold) is used when the global
    # ENABLE_RAG_HYBRID_SEARCH flag is set, plain vector search otherwise.
    # Any exception is logged and surfaced as HTTP 400.
    try:
        # A falsy k (missing or 0) falls back to the configured TOP_K.
        top_k = form_data.k if form_data.k else app.state.config.TOP_K

        if app.state.config.ENABLE_RAG_HYBRID_SEARCH:
            return query_doc_with_hybrid_search(
                collection_name=form_data.collection_name,
                query=form_data.query,
                embedding_function=app.state.EMBEDDING_FUNCTION,
                k=top_k,
                reranking_function=app.state.sentence_transformer_rf,
                # Compare against None rather than truthiness so an explicit
                # r=0.0 is honoured instead of being replaced by the
                # configured threshold.
                r=(
                    form_data.r
                    if form_data.r is not None
                    else app.state.config.RELEVANCE_THRESHOLD
                ),
            )
        else:
            return query_doc(
                collection_name=form_data.collection_name,
                query=form_data.query,
                embedding_function=app.state.EMBEDDING_FUNCTION,
                k=top_k,
            )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
642
643


Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
644
class QueryCollectionsForm(BaseModel):
    """Request body for the ``/query/collection`` endpoint."""

    # Names of the collections to search.
    collection_names: list[str]
    # Query text passed to the retrieval helpers.
    query: str
    # Number of results; the handler falls back to config TOP_K when falsy.
    k: Optional[int] = None
    # Relevance threshold for hybrid search; the handler falls back to
    # config RELEVANCE_THRESHOLD when falsy.
    r: Optional[float] = None
    # NOTE(review): not read by the handler visible in this file section.
    hybrid: Optional[bool] = None
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
650
651


652
@app.post("/query/collection")
def query_collection_handler(
    form_data: QueryCollectionsForm,
    user=Depends(get_verified_user),
):
    """Run a retrieval query across several collections.

    Delegates to ``query_collection_with_hybrid_search`` when
    ``app.state.config.ENABLE_RAG_HYBRID_SEARCH`` is truthy, otherwise to
    ``query_collection``. A falsy ``form_data.k`` / ``form_data.r`` falls back
    to the configured ``TOP_K`` / ``RELEVANCE_THRESHOLD``.

    Raises:
        HTTPException: 400 wrapping any exception raised while querying.
    """
    try:
        top_k = form_data.k or app.state.config.TOP_K

        if not app.state.config.ENABLE_RAG_HYBRID_SEARCH:
            return query_collection(
                collection_names=form_data.collection_names,
                query=form_data.query,
                embedding_function=app.state.EMBEDDING_FUNCTION,
                k=top_k,
            )

        # The threshold is only consulted on the hybrid path.
        threshold = form_data.r or app.state.config.RELEVANCE_THRESHOLD
        return query_collection_with_hybrid_search(
            collection_names=form_data.collection_names,
            query=form_data.query,
            embedding_function=app.state.EMBEDDING_FUNCTION,
            k=top_k,
            reranking_function=app.state.sentence_transformer_rf,
            r=threshold,
        )
    except Exception as err:
        log.exception(err)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(err),
        )
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
683
684


Timothy J. Baek's avatar
Timothy J. Baek committed
685
@app.post("/youtube")
def store_youtube_video(form_data: UrlForm, user=Depends(get_verified_user)):
    """Load a YouTube video through ``YoutubeLoader`` and store it in the vector DB.

    An empty ``collection_name`` is replaced by the first 63 characters of
    the SHA-256 of the URL. Any existing collection with that name is
    overwritten.

    Raises:
        HTTPException: 400 wrapping any exception raised while loading or storing.
    """
    try:
        documents = YoutubeLoader.from_youtube_url(
            form_data.url,
            add_video_info=True,
            language=app.state.config.YOUTUBE_LOADER_LANGUAGE,
            translation=app.state.YOUTUBE_LOADER_TRANSLATION,
        ).load()

        target_collection = (
            calculate_sha256_string(form_data.url)[:63]
            if form_data.collection_name == ""
            else form_data.collection_name
        )

        store_data_in_vector_db(documents, target_collection, overwrite=True)
        return {
            "status": True,
            "collection_name": target_collection,
            "filename": form_data.url,
        }
    except Exception as err:
        log.exception(err)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(err),
        )


714
@app.post("/web")
def store_web(form_data: UrlForm, user=Depends(get_verified_user)):
    """Fetch a web page and store its content in the vector DB.

    Example URL: "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"

    The loader is built by ``get_web_loader`` with the configured SSL
    verification setting. An empty ``collection_name`` is replaced by the
    first 63 characters of the SHA-256 of the URL; an existing collection
    with that name is overwritten.

    Raises:
        HTTPException: 400 wrapping any exception raised while loading or storing.
    """
    try:
        documents = get_web_loader(
            form_data.url,
            verify_ssl=app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
        ).load()

        target_collection = (
            calculate_sha256_string(form_data.url)[:63]
            if form_data.collection_name == ""
            else form_data.collection_name
        )

        store_data_in_vector_db(documents, target_collection, overwrite=True)
        return {
            "status": True,
            "collection_name": target_collection,
            "filename": form_data.url,
        }
    except Exception as err:
        log.exception(err)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(err),
        )

741

742
def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
    """Build a ``SafeWebBaseLoader`` for one URL or a sequence of URLs.

    Args:
        url: A single URL or a sequence of URLs; checked with ``validate_url``.
        verify_ssl: Forwarded to the loader.

    Raises:
        ValueError: If ``validate_url`` rejects the input.
    """
    url_is_acceptable = validate_url(url)
    if url_is_acceptable:
        return SafeWebBaseLoader(
            url,
            verify_ssl=verify_ssl,
            requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
            continue_on_failure=True,
        )
    raise ValueError(ERROR_MESSAGES.INVALID_URL)
752
753


754
755
756
757
def validate_url(url: Union[str, Sequence[str]]):
    """Validate a URL, or every URL in a sequence.

    For a string: it must pass ``validators.url``; additionally, when
    ``ENABLE_RAG_LOCAL_WEB_FETCH`` is falsy, none of the addresses its
    hostname resolves to may be private.

    Returns:
        ``True`` for an accepted string, the conjunction of the per-item
        results for a sequence, and ``False`` for any other input type.

    Raises:
        ValueError: If a string URL is malformed or resolves to a private address.
    """
    if isinstance(url, str):
        if isinstance(validators.url(url), validators.ValidationError):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)

        if ENABLE_RAG_LOCAL_WEB_FETCH:
            return True

        # Local web fetch is disabled: reject URLs resolving to private addresses.
        # This is technically still vulnerable to DNS rebinding attacks, as we
        # don't control WebBaseLoader.
        hostname = urllib.parse.urlparse(url).hostname
        ipv4_addresses, ipv6_addresses = resolve_hostname(hostname)

        if any(validators.ipv4(addr, private=True) for addr in ipv4_addresses):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        if any(validators.ipv6(addr, private=True) for addr in ipv6_addresses):
            raise ValueError(ERROR_MESSAGES.INVALID_URL)
        return True

    if isinstance(url, Sequence):
        return all(validate_url(item) for item in url)

    return False

Timothy J. Baek's avatar
Timothy J. Baek committed
777

Timothy J. Baek's avatar
revert  
Timothy J. Baek committed
778
779
780
781
782
783
784
785
786
787
788
def resolve_hostname(hostname):
    """Resolve *hostname* and split the results by address family.

    Returns:
        A ``(ipv4_addresses, ipv6_addresses)`` tuple of lists, in the order
        ``socket.getaddrinfo`` reported them (duplicates are kept).
    """
    ipv4_addresses = []
    ipv6_addresses = []

    for family, _socktype, _proto, _canonname, sockaddr in socket.getaddrinfo(
        hostname, None
    ):
        # The first element of the socket address is the IP string.
        if family == socket.AF_INET:
            ipv4_addresses.append(sockaddr[0])
        elif family == socket.AF_INET6:
            ipv6_addresses.append(sockaddr[0])

    return ipv4_addresses, ipv6_addresses


Timothy J. Baek's avatar
Timothy J. Baek committed
789
790
791
792
793
794
795
796
def search_web(engine: str, query: str) -> list[SearchResult]:
    """Search the web with the named engine and return its results.

    Supported ``engine`` values and the config entries each one requires:
    - "searxng": SEARXNG_QUERY_URL
    - "google_pse": GOOGLE_PSE_API_KEY + GOOGLE_PSE_ENGINE_ID
    - "brave": BRAVE_SEARCH_API_KEY
    - "serpstack": SERPSTACK_API_KEY
    - "serper": SERPER_API_KEY
    - "serply": SERPLY_API_KEY
    - "duckduckgo": (none)
    - "tavily": TAVILY_API_KEY
    - "jina": (none)

    Args:
        engine (str): Which search backend to use
        query (str): The query to search for

    Raises:
        Exception: If the engine's required config entry is empty, or the
            engine name is not one of the above.
    """
    config = app.state.config

    # TODO: add playwright to search the web
    if engine == "searxng":
        if not config.SEARXNG_QUERY_URL:
            raise Exception("No SEARXNG_QUERY_URL found in environment variables")
        return search_searxng(
            config.SEARXNG_QUERY_URL,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "google_pse":
        if not (config.GOOGLE_PSE_API_KEY and config.GOOGLE_PSE_ENGINE_ID):
            raise Exception(
                "No GOOGLE_PSE_API_KEY or GOOGLE_PSE_ENGINE_ID found in environment variables"
            )
        return search_google_pse(
            config.GOOGLE_PSE_API_KEY,
            config.GOOGLE_PSE_ENGINE_ID,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "brave":
        if not config.BRAVE_SEARCH_API_KEY:
            raise Exception("No BRAVE_SEARCH_API_KEY found in environment variables")
        return search_brave(
            config.BRAVE_SEARCH_API_KEY,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "serpstack":
        if not config.SERPSTACK_API_KEY:
            raise Exception("No SERPSTACK_API_KEY found in environment variables")
        return search_serpstack(
            config.SERPSTACK_API_KEY,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
            https_enabled=config.SERPSTACK_HTTPS,
        )

    if engine == "serper":
        if not config.SERPER_API_KEY:
            raise Exception("No SERPER_API_KEY found in environment variables")
        return search_serper(
            config.SERPER_API_KEY,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "serply":
        if not config.SERPLY_API_KEY:
            raise Exception("No SERPLY_API_KEY found in environment variables")
        return search_serply(
            config.SERPLY_API_KEY,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "duckduckgo":
        return search_duckduckgo(
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
            config.RAG_WEB_SEARCH_DOMAIN_FILTER_LIST,
        )

    if engine == "tavily":
        if not config.TAVILY_API_KEY:
            raise Exception("No TAVILY_API_KEY found in environment variables")
        return search_tavily(
            config.TAVILY_API_KEY,
            query,
            config.RAG_WEB_SEARCH_RESULT_COUNT,
        )

    if engine == "jina":
        return search_jina(query, config.RAG_WEB_SEARCH_RESULT_COUNT)

    raise Exception("No search engine API key found in environment variables")


Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
892
@app.post("/web/search")
def store_web_search(form_data: SearchForm, user=Depends(get_verified_user)):
    """Run a web search, fetch the result pages and store them in the vector DB.

    The search uses the engine named by ``RAG_WEB_SEARCH_ENGINE``. The result
    links are loaded through ``get_web_loader`` with the configured SSL
    verification setting (the same setting the ``/web`` endpoint applies).
    An empty ``collection_name`` is replaced by the first 63 characters of
    the SHA-256 of the query; an existing collection with that name is
    overwritten.

    Raises:
        HTTPException: 400 with ``WEB_SEARCH_ERROR`` if the search fails, or
            400 with ``DEFAULT`` if loading/storing the pages fails.
    """
    try:
        # Use the module logger (not the root ``logging`` module) so the
        # message follows this module's logging configuration.
        log.info(
            "trying to web search with %s",
            (app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query),
        )
        web_results = search_web(
            app.state.config.RAG_WEB_SEARCH_ENGINE, form_data.query
        )
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.WEB_SEARCH_ERROR(e),
        )

    try:
        urls = [result.link for result in web_results]
        loader = get_web_loader(
            urls,
            verify_ssl=app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION,
        )
        data = loader.load()

        collection_name = form_data.collection_name
        if collection_name == "":
            collection_name = calculate_sha256_string(form_data.query)[:63]

        store_data_in_vector_db(data, collection_name, overwrite=True)
        return {
            "status": True,
            "collection_name": collection_name,
            "filenames": urls,
        }
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )


933
934
935
def store_data_in_vector_db(
    data, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
) -> bool:
    """Split loaded documents into chunks and store them in a collection.

    Args:
        data: Documents to split with ``RecursiveCharacterTextSplitter``
            (configured from ``CHUNK_SIZE`` / ``CHUNK_OVERLAP``).
        collection_name: Target collection name.
        metadata: Extra metadata forwarded to ``store_docs_in_vector_db``.
        overwrite: Forwarded to ``store_docs_in_vector_db``.

    Returns:
        The boolean result of ``store_docs_in_vector_db``. This used to be
        wrapped in a ``(result, None)`` tuple, which is always truthy and hid
        storage failures from callers that test ``if result:``.

    Raises:
        ValueError: If splitting yields no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=app.state.config.CHUNK_SIZE,
        chunk_overlap=app.state.config.CHUNK_OVERLAP,
        add_start_index=True,
    )

    docs = text_splitter.split_documents(data)

    if not docs:
        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)

    log.info(f"store_data_in_vector_db {docs}")
    return store_docs_in_vector_db(docs, collection_name, metadata, overwrite)
950
951
952


def store_text_in_vector_db(
    text, metadata, collection_name, overwrite: bool = False
) -> bool:
    """Split raw text into chunks and store them in a collection.

    Args:
        text: Text to split; ``metadata`` is attached to the created documents.
        metadata: Metadata passed to ``create_documents`` for ``text``.
        collection_name: Target collection name.
        overwrite: Forwarded to ``store_docs_in_vector_db``.

    Returns:
        The result of ``store_docs_in_vector_db``.
    """
    splitter_options = {
        "chunk_size": app.state.config.CHUNK_SIZE,
        "chunk_overlap": app.state.config.CHUNK_OVERLAP,
        "add_start_index": True,
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).create_documents(
        [text], metadatas=[metadata]
    )
    return store_docs_in_vector_db(chunks, collection_name, overwrite=overwrite)
962
963


964
965
966
def store_docs_in_vector_db(
    docs, collection_name, metadata: Optional[dict] = None, overwrite: bool = False
) -> bool:
    """Embed document chunks and add them to a newly created Chroma collection.

    Args:
        docs: Chunks exposing ``page_content`` and ``metadata``.
        collection_name: Name of the collection to create.
        metadata: Extra key/values merged over each chunk's own metadata.
        overwrite: When true, an existing collection with the same name is
            deleted first.

    Returns:
        ``True`` on success or when the raised exception's class is named
        ``UniqueConstraintError``; ``False`` (after logging) on any other
        exception.
    """
    log.info(f"store_docs_in_vector_db {docs} {collection_name}")

    extra_metadata = metadata if metadata else {}
    texts = [doc.page_content for doc in docs]

    # ChromaDB does not like datetime formats
    # for meta-data so convert them to string.
    metadatas = [
        {
            key: str(value) if isinstance(value, datetime) else value
            for key, value in {**doc.metadata, **extra_metadata}.items()
        }
        for doc in docs
    ]

    try:
        if overwrite:
            for existing in CHROMA_CLIENT.list_collections():
                if existing.name == collection_name:
                    log.info(f"deleting existing collection {collection_name}")
                    CHROMA_CLIENT.delete_collection(name=collection_name)

        collection = CHROMA_CLIENT.create_collection(name=collection_name)

        embed = get_embedding_function(
            app.state.config.RAG_EMBEDDING_ENGINE,
            app.state.config.RAG_EMBEDDING_MODEL,
            app.state.sentence_transformer_ef,
            app.state.config.OPENAI_API_KEY,
            app.state.config.OPENAI_API_BASE_URL,
            app.state.config.RAG_EMBEDDING_OPENAI_BATCH_SIZE,
        )

        # Newlines are flattened to spaces for embedding only; the stored
        # documents keep their original text.
        embeddings = embed([text.replace("\n", " ") for text in texts])

        batches = create_batches(
            api=CHROMA_CLIENT,
            ids=[str(uuid.uuid4()) for _ in texts],
            metadatas=metadatas,
            embeddings=embeddings,
            documents=texts,
        )
        for batch in batches:
            collection.add(*batch)

        return True
    except Exception as err:
        if err.__class__.__name__ == "UniqueConstraintError":
            return True

        log.exception(err)

        return False


1019
1020
1021
1022
1023
class TikaLoader:
    """Document loader that extracts text by sending the file to a Tika server."""

    def __init__(self, file_path, mime_type=None):
        self.file_path = file_path
        self.mime_type = mime_type

    def load(self) -> list[Document]:
        """PUT the file to ``<TIKA_SERVER_URL>/tika/text`` and wrap the reply.

        Returns:
            A single-element list with a ``Document`` whose content is the
            ``X-TIKA:content`` field of the JSON reply and whose metadata is
            the ``Content-Type`` header dict (updated from the reply if present).

        Raises:
            Exception: If the HTTP response is not OK.
        """
        with open(self.file_path, "rb") as source:
            payload = source.read()

        headers = {} if self.mime_type is None else {"Content-Type": self.mime_type}

        base_url = app.state.config.TIKA_SERVER_URL
        separator = "" if base_url.endswith("/") else "/"
        endpoint = f"{base_url}{separator}tika/text"

        response = requests.put(endpoint, data=payload, headers=headers)

        if not response.ok:
            raise Exception(f"Error calling Tika: {response.reason}")

        raw_metadata = response.json()
        text = raw_metadata.get("X-TIKA:content", "<No text content found>")

        # Prefer the content type reported in the reply over the one we sent.
        if "Content-Type" in raw_metadata:
            headers["Content-Type"] = raw_metadata["Content-Type"]

        log.info("Tika extracted text: %s", text)

        return [Document(page_content=text, metadata=headers)]


1054
1055
def get_loader(filename: str, file_content_type: str, file_path: str):
    """Pick a document loader for a file.

    Args:
        filename: Used only for its extension (text after the last ".").
        file_content_type: MIME type, may be falsy.
        file_path: Path handed to the chosen loader.

    Returns:
        ``(loader, known_type)``. ``known_type`` is ``False`` only when the
        non-Tika path falls back to ``TextLoader`` for an unrecognised file.
    """
    file_ext = filename.split(".")[-1].lower()

    known_source_ext = {
        "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "ts", "css",
        "cpp", "hpp", "h", "c", "cs", "sql", "log", "ini", "pl", "pm", "r",
        "dart", "dockerfile", "env", "php", "hs", "hsc", "lua", "nginxconf",
        "conf", "m", "mm", "plsql", "perl", "rb", "rs", "db2", "scala",
        "bash", "swift", "vue", "svelte", "msg", "ex", "exs", "erl", "tsx",
        "jsx", "lhs",
    }

    word_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    excel_mimes = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    powerpoint_mimes = (
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def _is_plain_text() -> bool:
        # Known source-code extension, or a MIME type containing "text/".
        return file_ext in known_source_ext or bool(
            file_content_type and "text/" in file_content_type
        )

    config = app.state.config

    if config.CONTENT_EXTRACTION_ENGINE == "tika" and config.TIKA_SERVER_URL:
        # With Tika enabled, only plain-text files are read locally.
        if _is_plain_text():
            return TextLoader(file_path, autodetect_encoding=True), True
        return TikaLoader(file_path, file_content_type), True

    if file_ext == "pdf":
        return (
            PyPDFLoader(file_path, extract_images=config.PDF_EXTRACT_IMAGES),
            True,
        )
    if file_ext == "csv":
        return CSVLoader(file_path), True
    if file_ext == "rst":
        return UnstructuredRSTLoader(file_path, mode="elements"), True
    if file_ext == "xml":
        return UnstructuredXMLLoader(file_path), True
    if file_ext in ("htm", "html"):
        return BSHTMLLoader(file_path, open_encoding="unicode_escape"), True
    if file_ext == "md":
        return UnstructuredMarkdownLoader(file_path), True
    if file_content_type == "application/epub+zip":
        return UnstructuredEPubLoader(file_path), True
    if file_content_type == word_mime or file_ext in ("doc", "docx"):
        return Docx2txtLoader(file_path), True
    if file_content_type in excel_mimes or file_ext in ("xls", "xlsx"):
        return UnstructuredExcelLoader(file_path), True
    if file_content_type in powerpoint_mimes or file_ext in ("ppt", "pptx"):
        return UnstructuredPowerPointLoader(file_path), True
    if file_ext == "msg":
        return OutlookMessageLoader(file_path), True

    # Everything else is read as text; only recognised text is a "known" type.
    known_type = _is_plain_text()
    return TextLoader(file_path, autodetect_encoding=True), known_type


1167
@app.post("/doc")
def store_doc(
    collection_name: Optional[str] = Form(None),
    file: UploadFile = File(...),
    user=Depends(get_verified_user),
):
    """Save an uploaded file under ``UPLOAD_DIR`` and index it in the vector DB.

    When ``collection_name`` is omitted it defaults to the first 63
    characters of the SHA-256 of the saved file.

    Returns:
        A dict with ``status``, ``collection_name``, ``filename`` and
        ``known_type`` on success.

    Raises:
        HTTPException: 500 if storing in the vector DB raises or reports
            failure; 400 for any other error (with a dedicated message when
            the error text contains "No pandoc was found").
    """
    log.info(f"file.content_type: {file.content_type}")
    try:
        # basename() drops any directory components supplied by the client.
        unsanitized_filename = file.filename
        filename = os.path.basename(unsanitized_filename)

        file_path = f"{UPLOAD_DIR}/{filename}"

        contents = file.file.read()
        with open(file_path, "wb") as f:
            f.write(contents)

        if collection_name is None:
            # Context manager guarantees the handle is closed even if hashing fails.
            with open(file_path, "rb") as f:
                collection_name = calculate_sha256(f)[:63]

        loader, known_type = get_loader(filename, file.content_type, file_path)
        data = loader.load()

        try:
            result = store_data_in_vector_db(data, collection_name)
        except Exception as e:
            log.exception(e)
            # detail must be JSON-serializable, so pass a message, not the exception.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(e),
            ) from e

        if not result:
            # Previously a falsy result fell through and returned None.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(),
            )

        return {
            "status": True,
            "collection_name": collection_name,
            "filename": filename,
            "known_type": known_type,
        }
    except HTTPException:
        # Let the deliberate HTTP errors above through unchanged instead of
        # having the generic handler below rewrite them as 400.
        raise
    except Exception as e:
        log.exception(e)
        if "No pandoc was found" in str(e):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
            )
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )
1222
1223


Timothy J. Baek's avatar
Timothy J. Baek committed
1224
1225
class ProcessDocForm(BaseModel):
    """Request body for the ``/process/doc`` endpoint."""

    # Identifier passed to Files.get_file_by_id to locate the stored file.
    file_id: str
    # Target collection; when None the handler derives it from the first
    # 63 characters of the file's SHA-256.
    collection_name: Optional[str] = None
Timothy J. Baek's avatar
Timothy J. Baek committed
1227
1228
1229
1230
1231


@app.post("/process/doc")
def process_doc(
    form_data: ProcessDocForm,
    user=Depends(get_verified_user),
):
    """Index an already-stored file (looked up by id) in the vector DB.

    The file path comes from the file record's ``meta["path"]``, falling back
    to ``UPLOAD_DIR/<filename>``. When ``collection_name`` is omitted it
    defaults to the first 63 characters of the SHA-256 of the file. Each
    stored chunk is tagged with ``file_id`` and ``name`` metadata.

    Returns:
        A dict with ``status``, ``collection_name``, ``known_type`` and
        ``filename`` on success.

    Raises:
        HTTPException: 500 if storing in the vector DB raises or reports
            failure; 400 for any other error (with a dedicated message when
            the error text contains "No pandoc was found").
    """
    try:
        file = Files.get_file_by_id(form_data.file_id)
        file_path = file.meta.get("path", f"{UPLOAD_DIR}/{file.filename}")

        collection_name = form_data.collection_name
        if collection_name is None:
            # Context manager guarantees the handle is closed even if hashing fails.
            with open(file_path, "rb") as f:
                collection_name = calculate_sha256(f)[:63]

        loader, known_type = get_loader(
            file.filename, file.meta.get("content_type"), file_path
        )
        data = loader.load()

        try:
            result = store_data_in_vector_db(
                data,
                collection_name,
                {
                    "file_id": form_data.file_id,
                    "name": file.meta.get("name", file.filename),
                },
            )
        except Exception as e:
            log.exception(e)
            # detail must be JSON-serializable, so pass a message, not the exception.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(e),
            ) from e

        if not result:
            # Previously a falsy result fell through and returned None.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=ERROR_MESSAGES.DEFAULT(),
            )

        return {
            "status": True,
            "collection_name": collection_name,
            "known_type": known_type,
            "filename": file.meta.get("name", file.filename),
        }
    except HTTPException:
        # Let the deliberate HTTP errors above through unchanged instead of
        # having the generic handler below rewrite them as 400.
        raise
    except Exception as e:
        log.exception(e)
        if "No pandoc was found" in str(e):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
            )
        else:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )


1286
1287
1288
1289
1290
1291
1292
1293
1294
class TextRAGForm(BaseModel):
    """Request body for the ``/text`` endpoint."""

    # Stored as the "name" metadata of the resulting chunks.
    name: str
    # Raw text to split and store.
    content: str
    # Target collection; when None the handler derives it from a SHA-256
    # of ``content``.
    collection_name: Optional[str] = None


@app.post("/text")
def store_text(
    form_data: TextRAGForm,
    user=Depends(get_verified_user),
):
    """Split raw text and store it in the vector DB.

    When ``collection_name`` is omitted it defaults to the first 63
    characters of the SHA-256 of the content. The truncation matches the
    other endpoints in this file; a full 64-character hex digest would not
    fit a 63-character collection-name limit.

    Returns:
        ``{"status": True, "collection_name": ...}`` on success.

    Raises:
        HTTPException: 500 if ``store_text_in_vector_db`` reports failure.
    """
    collection_name = form_data.collection_name
    if collection_name is None:
        collection_name = calculate_sha256_string(form_data.content)[:63]

    result = store_text_in_vector_db(
        form_data.content,
        metadata={"name": form_data.name, "created_by": user.id},
        collection_name=collection_name,
    )

    if result:
        return {"status": True, "collection_name": collection_name}
    else:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=ERROR_MESSAGES.DEFAULT(),
        )


1317
1318
@app.get("/scan")
def scan_docs_dir(user=Depends(get_admin_user)):
    """Index every non-hidden file under ``DOCS_DIR`` and register new documents.

    Each file is stored in a collection named after the first 63 characters
    of its SHA-256. When storing succeeds and no document with the sanitized
    filename exists yet, a document record is inserted; the folder names
    returned by ``extract_folders_after_data_docs`` become its tags.

    Failures on individual files are logged and skipped, so the scan is
    best-effort.

    Returns:
        Always ``True``.
    """
    for path in Path(DOCS_DIR).rglob("./**/*"):
        try:
            if not path.is_file() or path.name.startswith("."):
                continue

            tags = extract_folders_after_data_docs(path)
            filename = path.name
            file_content_type = mimetypes.guess_type(path)

            # Context manager guarantees the handle is closed even if hashing fails.
            with open(path, "rb") as f:
                collection_name = calculate_sha256(f)[:63]

            loader, _known_type = get_loader(
                filename, file_content_type[0], str(path)
            )
            data = loader.load()

            try:
                result = store_data_in_vector_db(data, collection_name)

                if result:
                    sanitized_filename = sanitize_filename(filename)
                    doc = Documents.get_doc_by_name(sanitized_filename)

                    if doc is None:
                        content = (
                            json.dumps({"tags": [{"name": name} for name in tags]})
                            if len(tags)
                            else "{}"
                        )
                        doc = Documents.insert_new_doc(
                            user.id,
                            DocumentForm(
                                **{
                                    "name": sanitized_filename,
                                    "title": filename,
                                    "collection_name": collection_name,
                                    "filename": filename,
                                    "content": content,
                                }
                            ),
                        )
            except Exception as e:
                # Storing/registering one file must not abort the whole scan.
                log.exception(e)

        except Exception as e:
            log.exception(e)

    return True


Timothy J. Baek's avatar
Timothy J. Baek committed
1378
@app.get("/reset/db")
def reset_vector_db(user=Depends(get_admin_user)):
    # The `user` argument is resolved through the get_admin_user dependency and
    # is otherwise unused; presumably this restricts the route to admins.
    #
    # Calls reset() on the module-level CHROMA_CLIENT. Any exception it raises
    # propagates to the framework, and the handler returns None.
    CHROMA_CLIENT.reset()
Timothy J. Baek's avatar
Timothy J. Baek committed
1381
1382


Timothy J. Baek's avatar
Timothy J. Baek committed
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
@app.get("/reset/uploads")
def reset_upload_dir(user=Depends(get_admin_user)) -> bool:
    """Delete every entry inside the upload directory (best effort).

    Files and symlinks are unlinked; sub-directories are removed recursively.
    Failures are logged through the module logger and never raised, so one
    undeletable entry does not stop the remaining entries from being removed.

    Args:
        user: Resolved through the ``get_admin_user`` dependency; not used
            inside the handler.

    Returns:
        Always ``True``, including when the directory is missing or some
        entries could not be deleted.
    """
    folder = f"{UPLOAD_DIR}"
    try:
        if not os.path.exists(folder):
            log.warning("The directory %s does not exist", folder)
            return True

        for filename in os.listdir(folder):
            file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)  # Remove the file or link
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # Remove the directory
            except Exception as e:
                # Keep going: a single failure must not abort the cleanup.
                log.error("Failed to delete %s. Reason: %s", file_path, e)
    except Exception as e:
        # Covers failures while probing or listing the directory itself.
        log.error("Failed to process the directory %s. Reason: %s", folder, e)

    return True


Timothy J. Baek's avatar
Timothy J. Baek committed
1407
@app.get("/reset")
def reset(user=Depends(get_admin_user)) -> bool:
    """Clear the upload directory and then reset the Chroma client.

    Both steps are best effort: every failure is logged and swallowed so that
    a problem in one step never prevents the other from running.

    Args:
        user: Resolved through the ``get_admin_user`` dependency; not used
            inside the handler.

    Returns:
        Always ``True``.
    """
    folder = f"{UPLOAD_DIR}"

    # Listing can fail (directory missing, not a directory, permissions).
    # Treat that as "nothing to delete" instead of aborting the request, so
    # the vector DB reset below still runs.
    try:
        entries = os.listdir(folder)
    except OSError as e:
        log.error("Failed to process the directory %s. Reason: %s", folder, e)
        entries = []

    for filename in entries:
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Keep going: a single failure must not abort the cleanup.
            log.error("Failed to delete %s. Reason: %s", file_path, e)

    try:
        CHROMA_CLIENT.reset()
    except Exception as e:
        log.exception(e)

    return True
Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
1426

Timothy J. Baek's avatar
Timothy J. Baek committed
1427

1428
1429
class SafeWebBaseLoader(WebBaseLoader):
    """WebBaseLoader whose lazy loading logs and skips URLs that fail."""

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per entry of ``self.web_paths``.

        Each page is scraped, its text extracted, and a metadata dict built
        from the ``<title>``, ``<meta name="description">`` and ``<html lang>``
        elements when present. Any exception raised while handling a URL is
        logged and that URL is skipped.
        """
        for path in self.web_paths:
            try:
                page = self._scrape(path, bs_kwargs=self.bs_kwargs)
                body_text = page.get_text(**self.bs_get_text_kwargs)

                # "source" is always set; the other keys are added only when
                # the corresponding element is found in the page.
                metadata = {"source": path}

                title_tag = page.find("title")
                if title_tag:
                    metadata["title"] = title_tag.get_text()

                description_tag = page.find("meta", attrs={"name": "description"})
                if description_tag:
                    metadata["description"] = description_tag.get(
                        "content", "No description found."
                    )

                html_tag = page.find("html")
                if html_tag:
                    metadata["language"] = html_tag.get("lang", "No language found.")

                yield Document(page_content=body_text, metadata=metadata)
            except Exception as e:
                # Report the failure and move on to the next URL.
                log.error(f"Error loading {path}: {e}")
Timothy J. Baek's avatar
Timothy J. Baek committed
1453
1454


Timothy J. Baek's avatar
refac  
Timothy J. Baek committed
1455
1456
1457
1458
1459
1460
1461
1462
1463
if ENV == "dev":
    # These routes are registered only when ENV equals "dev". Each returns the
    # output of app.state.EMBEDDING_FUNCTION under the "result" key.

    @app.get("/ef")
    async def get_embeddings():
        # Embed a fixed sample string.
        embedding = app.state.EMBEDDING_FUNCTION("hello world")
        return {"result": embedding}

    @app.get("/ef/{text}")
    async def get_embeddings_text(text: str):
        # Embed the text supplied in the URL path.
        embedding = app.state.EMBEDDING_FUNCTION(text)
        return {"result": embedding}