Ritvik19 committed on
Commit
5d2b3d1
1 Parent(s): 0c85aeb

Update process_documents.py

Browse files
Files changed (1) hide show
  1. process_documents.py +3 -1
process_documents.py CHANGED
@@ -36,14 +36,16 @@ def process_documents(urls):
36
  def process_web(url, source_id):
37
  data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
38
  try:
 
39
  page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
40
  except Exception as e:
 
41
  page_content = data.page_content.strip()
42
  document_snippets = [
43
  Document(
44
  page_content=page_content,
45
  metadata={
46
- "header": data.metadata["title"],
47
  "source_url": url,
48
  "source_type": "web",
49
  "chunk_id": source_id,
 
36
  def process_web(url, source_id):
37
  data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
38
  try:
39
+ header = re.search(r"Title: (.*)?", data.page_content).group(1)
40
  page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
41
  except Exception as e:
42
+ header = ""
43
  page_content = data.page_content.strip()
44
  document_snippets = [
45
  Document(
46
  page_content=page_content,
47
  metadata={
48
+ "header": header,
49
  "source_url": url,
50
  "source_type": "web",
51
  "chunk_id": source_id,