Unverified Commit 3bec60b8 authored by Que Nguyen, committed by GitHub

Fixed the issue where a single URL error disrupts the data loading process in Web Search mode

To address an unresolved issue in the LangChain library, where a single failing URL aborts the entire data loading process, a SafeWebBaseLoader subclass of WebBaseLoader now overrides the lazy_load method. The overriding method catches exceptions per URL, logs the error, and continues with the remaining URLs.
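For reference, here is a minimal usage sketch of the behavior this change enables (the URLs are illustrative, and SafeWebBaseLoader is the subclass added in the diff below). With the stock WebBaseLoader, the unreachable URL would raise and abort the whole load; the safe loader logs the failure and keeps yielding documents for the URLs that succeed:

```python
# Illustrative usage only; hypothetical URLs.
# SafeWebBaseLoader takes the same arguments as WebBaseLoader.
loader = SafeWebBaseLoader(
    ["https://example.com", "https://unreachable.invalid"],
    verify_ssl=True,
)

# lazy_load() now skips the failing URL instead of raising, so `docs`
# ends up holding a Document for https://example.com only.
docs = list(loader.lazy_load())
```

Catching a broad Exception per URL trades strictness for resilience, which fits Web Search mode: one dead link should not sink an entire batch of results.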
parent bf1936de
@@ -12,9 +12,10 @@ import os, shutil, logging, re
 from datetime import datetime
 from pathlib import Path
-from typing import List, Union, Sequence
+from typing import List, Union, Sequence, Iterator, Any
 from chromadb.utils.batch_utils import create_batches
+from langchain_core.documents import Document
 from langchain_community.document_loaders import (
     WebBaseLoader,
@@ -701,7 +702,7 @@ def get_web_loader(url: Union[str, Sequence[str]], verify_ssl: bool = True):
     # Check if the URL is valid
     if not validate_url(url):
         raise ValueError(ERROR_MESSAGES.INVALID_URL)
-    return WebBaseLoader(
+    return SafeWebBaseLoader(
         url,
         verify_ssl=verify_ssl,
         requests_per_second=RAG_WEB_SEARCH_CONCURRENT_REQUESTS,
@@ -1237,6 +1238,28 @@ def reset(user=Depends(get_admin_user)) -> bool:
     return True
+
+
+class SafeWebBaseLoader(WebBaseLoader):
+    """WebBaseLoader with enhanced error handling for URLs."""
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load text from the url(s) in web_path with error handling."""
+        for path in self.web_paths:
+            try:
+                soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
+                text = soup.get_text(**self.bs_get_text_kwargs)
+                # Build metadata
+                metadata = {"source": path}
+                if title := soup.find("title"):
+                    metadata["title"] = title.get_text()
+                if description := soup.find("meta", attrs={"name": "description"}):
+                    metadata["description"] = description.get("content", "No description found.")
+                if html := soup.find("html"):
+                    metadata["language"] = html.get("lang", "No language found.")
+                yield Document(page_content=text, metadata=metadata)
+            except Exception as e:
+                # Log the error and continue with the next URL
+                log.error(f"Error loading {path}: {e}")
 
 
 if ENV == "dev":
...