Update process_documents.py
Browse files - process_documents.py (+5 -1)
process_documents.py
CHANGED
@@ -35,9 +35,13 @@ def process_documents(urls):
|
|
35 |
|
36 |
def process_web(url, source_id):
|
37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
|
|
|
|
|
|
|
|
38 |
document_snippets = [
|
39 |
Document(
|
40 |
-
page_content=
|
41 |
metadata={
|
42 |
"header": data.metadata["title"],
|
43 |
"source_url": url,
|
|
|
35 |
|
36 |
def process_web(url, source_id):
|
37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
38 |
+
try:
|
39 |
+
page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
|
40 |
+
except Exception as e:
|
41 |
+
page_content = data.page_content.strip()
|
42 |
document_snippets = [
|
43 |
Document(
|
44 |
+
page_content=page_content,
|
45 |
metadata={
|
46 |
"header": data.metadata["title"],
|
47 |
"source_url": url,
|