shimizukawa committed on
Commit
c1dc2ee
1 Parent(s): b6dd5cc

add custom readthedocs loader

Browse files
Files changed (5) hide show
  1. config.py +1 -1
  2. loaders/__init__.py +3 -1
  3. loaders/rtdhtmlpage.py +77 -0
  4. requirements.txt +2 -1
  5. store.py +3 -2
config.py CHANGED
@@ -22,7 +22,7 @@ def get_index_names():
22
  keys = [
23
  k for k in [
24
  k.strip().lower()
25
- for k in os.environ["INDEX_NAMES"].split(",")
26
  ]
27
  if k
28
  ]
 
22
  keys = [
23
  k for k in [
24
  k.strip().lower()
25
+ for k in os.environ.get("INDEX_NAMES", "").split(",")
26
  ]
27
  if k
28
  ]
loaders/__init__.py CHANGED
@@ -1,9 +1,11 @@
1
  from .wikipage import WikiPageLoader
2
  from .github_issue import GithubIssueLoader
 
3
 
4
  LOADERS = {
5
  "wikipage": WikiPageLoader,
6
- "github_issue": GithubIssueLoader
 
7
  }
8
  LOADER_NAMES = tuple(LOADERS.keys())
9
 
 
1
  from .wikipage import WikiPageLoader
2
  from .github_issue import GithubIssueLoader
3
+ from .rtdhtmlpage import RTDHtmlPageLoader
4
 
5
  LOADERS = {
6
  "wikipage": WikiPageLoader,
7
+ "github_issue": GithubIssueLoader,
8
+ "rtdhtmlpage": RTDHtmlPageLoader,
9
  }
10
  LOADER_NAMES = tuple(LOADERS.keys())
11
 
loaders/rtdhtmlpage.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Iterator
4
+
5
+ from langchain.docstore.document import Document
6
+ from langchain.document_loaders import ReadTheDocsLoader
7
+
8
+
9
class RTDHtmlPageLoader(ReadTheDocsLoader):
    """Load a directory tree of downloaded documentation HTML pages.

    Every regular file found under the input directory is parsed and turned
    into one ``Document``: the page text comes from the page's main content
    element, and the metadata carries the page title, the index name, and a
    URL/id derived from the file's path.

    Example::

        $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
        $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
    """

    def __init__(self, index: str, inputfile: Path, *args, **kwargs):
        """Create a loader for the pages stored under *inputfile*.

        Args:
            index: Index name recorded in every document's metadata.
            inputfile: Root directory of the downloaded HTML tree.
            *args: Forwarded unchanged to ``ReadTheDocsLoader``.
            **kwargs: Forwarded to ``ReadTheDocsLoader``; ``custom_html_tag``
                is always overridden with ``<div id="docs-content">``.
        """
        self.index = index
        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
        super().__init__(inputfile, *args, **kwargs)

    def _my_clean_data(self, data: str) -> tuple[str, str]:
        """Extract the main text and the title from one HTML page.

        Args:
            data: Raw HTML source of the page.

        Returns:
            A ``(text, title)`` pair. ``text`` is the content element's text
            with empty lines removed; ``title`` is the text of the first
            ``<h1>`` inside it, excluding ``<a>`` children. Both are empty
            strings when no content element matches, and ``title`` is empty
            when the content element has no ``<h1>``.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(data, **self.bs_kwargs)

        # Default content containers, tried after the custom one.
        html_tags = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]
        if self.custom_html_tag is not None:
            html_tags.append(self.custom_html_tag)

        # Reversed order: the custom tag was appended last, so check it first.
        content = None
        for tag, attrs in reversed(html_tags):
            content = soup.find(tag, attrs)
            if content is not None:
                break

        if content is None:
            return "", ""

        # ``find`` returns None when there is no <h1>; iterating None would
        # raise TypeError and abort the whole load, so fall back to "".
        heading = content.find("h1")
        if heading is None:
            title = ""
        else:
            # Skip <a> children of the heading when building the title.
            title = "".join(t.text for t in heading if t.name != "a")

        # Trim empty lines.
        text = "\n".join(line for line in content.get_text().split("\n") if line)

        return text, title

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per regular file under the input directory."""
        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text, title = self._my_clean_data(f.read())

            metadata = {
                "title": title,
                # Time of loading, not the file's own timestamp.
                "ctime": int(datetime.now().timestamp()),
                "user": "rtd",
                "type": "rtd",
                # The mirrored path doubles as host + URL path; as_posix()
                # keeps forward slashes on every platform.
                "url": f"https://{p.as_posix()}",
                "index": self.index,
                "id": str(p),
            }
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        """Return every document produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())
requirements.txt CHANGED
@@ -9,4 +9,5 @@ sentence_transformers
9
  streamlit
10
  python-dateutil
11
  openai
12
- tqdm
 
 
9
  streamlit
10
  python-dateutil
11
  openai
12
+ tqdm
13
+ beautifulsoup4
store.py CHANGED
@@ -56,7 +56,7 @@ def store(texts):
56
  def get_parser():
57
  p = argparse.ArgumentParser()
58
  p.add_argument("index", type=str)
59
- p.add_argument("inputfile", metavar="INPUTFILE", type=argparse.FileType("rt"))
60
  p.add_argument("-l", "--loader", type=str, choices=LOADER_NAMES, required=True)
61
  return p
62
 
@@ -65,13 +65,14 @@ def main():
65
  """
66
  $ python store.py --loader wikipage "index" "FILE_PATH"
67
  $ python store.py -l wikipage wiki data/wiki.json
 
68
  """
69
  p = get_parser()
70
  args = p.parse_args()
71
  loader = get_loader(
72
  args.loader,
73
  index=args.index,
74
- inputfile=Path(args.inputfile.name),
75
  )
76
 
77
  docs = loader.load()
 
56
  def get_parser():
57
  p = argparse.ArgumentParser()
58
  p.add_argument("index", type=str)
59
+ p.add_argument("inputfile", metavar="INPUTFILE", type=str)
60
  p.add_argument("-l", "--loader", type=str, choices=LOADER_NAMES, required=True)
61
  return p
62
 
 
65
  """
66
  $ python store.py --loader wikipage "index" "FILE_PATH"
67
  $ python store.py -l wikipage wiki data/wiki.json
68
+ $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
69
  """
70
  p = get_parser()
71
  args = p.parse_args()
72
  loader = get_loader(
73
  args.loader,
74
  index=args.index,
75
+ inputfile=Path(args.inputfile),
76
  )
77
 
78
  docs = loader.load()