Ritvik19 committed on
Commit
0c85aeb
1 Parent(s): b0a4a7b

Update process_documents.py

Browse files
Files changed (1) hide show
  1. process_documents.py +5 -1
process_documents.py CHANGED
@@ -35,9 +35,13 @@ def process_documents(urls):
35
 
36
  def process_web(url, source_id):
37
  data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
 
 
 
 
38
  document_snippets = [
39
  Document(
40
- page_content=data.page_content,
41
  metadata={
42
  "header": data.metadata["title"],
43
  "source_url": url,
 
35
 
36
  def process_web(url, source_id):
37
  data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
38
+ try:
39
+ page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
40
+ except Exception as e:
41
+ page_content = data.page_content.strip()
42
  document_snippets = [
43
  Document(
44
+ page_content=page_content,
45
  metadata={
46
  "header": data.metadata["title"],
47
  "source_url": url,