inflaton committed on
Commit
a766494
•
1 Parent(s): 59fc6ec

switch from Unstructured Loader to PyPDF as its results have page number

Browse files
.env.example CHANGED
@@ -18,6 +18,8 @@ HF_PIPELINE_DEVICE_TYPE=
18
 
19
  CHAT_HISTORY_ENABLED=true
20
 
 
 
21
  # if unset, default to "hkunlp/instructor-xl"
22
  HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
23
 
 
18
 
19
  CHAT_HISTORY_ENABLED=true
20
 
21
+ PDF_FILE_BASE_URL=
22
+
23
  # if unset, default to "hkunlp/instructor-xl"
24
  HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
25
 
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Chat With AI Books
3
  emoji: 👀
4
  colorFrom: indigo
5
  colorTo: blue
 
1
  ---
2
+ title: Chat with AI Books
3
  emoji: 👀
4
  colorFrom: indigo
5
  colorTo: blue
app_modules/qa_chain.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
  from queue import Queue
4
  from typing import Any, Optional
5
 
@@ -528,4 +529,14 @@ class QAChain:
528
  self.streamer.reset(q)
529
 
530
  qa = self.get_chain(tracing)
531
- return qa(inputs)
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
+ import urllib
4
  from queue import Queue
5
  from typing import Any, Optional
6
 
 
529
  self.streamer.reset(q)
530
 
531
  qa = self.get_chain(tracing)
532
+ result = qa(inputs)
533
+
534
+ base_url = os.environ.get("PDF_FILE_BASE_URL")
535
+ if base_url is not None:
536
+ documents = result["source_documents"]
537
+ for doc in documents:
538
+ source = doc.metadata["source"]
539
+ title = source.split("/")[-1]
540
+ doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
541
+
542
+ return result
data/chromadb_1024_512/chroma-collections.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19ef26d149b26a03e85a8341d84187c920cfe877e0265871e9950829f553a1ea
3
  size 557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d0e4364f9a67d91e3185cc597297b8651ca02bdfddb8467767c8a71cbb89d4e
3
  size 557
data/chromadb_1024_512/chroma-embeddings.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38d53221ceafcfb1af1d4a068ba81076a998cfc4ecbc09952c02ee14353b0daa
3
- size 156890027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b050c60c5fd263355aabc3cc35e6308930cb4b8a1929e7209b6777da0782d59
3
+ size 7513430
data/chromadb_1024_512/index/{id_to_uuid_67de6665-0585-4559-85bd-e044c61f64df.pkl → id_to_uuid_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a58ac77e06b03413e225e1e2fc0fb956883fe6ac387be6332942bbb4ab6ac946
3
- size 997779
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eb4fe05362f4052e3af173b0915e9758bb7bc7f9f681850e765cbde35d8783f
3
+ size 47652
data/chromadb_1024_512/index/{uuid_to_id_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_44a39155-bdc7-450c-8532-01db0e4b66cc.bin} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e971f597172b30a15507fdaf90438399dfd9b54c0599196c3f9c3422218bcfe6
3
- size 1166688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a26db7cd65749049856321b4aef559a0ffbef7f4286131c1bcd5f5dc4cc3849
3
+ size 4743996
data/chromadb_1024_512/index/{index_metadata_67de6665-0585-4559-85bd-e044c61f64df.pkl → index_metadata_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47aef36a38d7c29f39b1b7e69e7788020b31f6c52c96a3f1bd8d13e79eeecda1
3
  size 105
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae5e0c780f18efa625dc2d0ad2d60328b51d2842cac144446196e4032e7c2c43
3
  size 105
data/chromadb_1024_512/index/{index_67de6665-0585-4559-85bd-e044c61f64df.bin → uuid_to_id_44a39155-bdc7-450c-8532-01db0e4b66cc.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a74d2560e304e1c78499c16465d6135d227330f021c98bb1c83a4fc8a641eeeb
3
- size 98876604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49aab2d749c5650688e4b1b566d8773889ca59d92ea2083d04fd5882a626ecc0
3
+ size 55737
ingest.py CHANGED
@@ -3,9 +3,7 @@ import os
3
  from timeit import default_timer as timer
4
  from typing import List
5
 
6
- import torch
7
- from dotenv import load_dotenv
8
- from langchain.document_loaders.directory import DirectoryLoader
9
  from langchain.embeddings import HuggingFaceInstructEmbeddings
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain.vectorstores.chroma import Chroma
@@ -14,7 +12,7 @@ from app_modules.utils import *
14
 
15
 
16
  def load_documents(source_pdfs_path) -> List:
17
- loader = DirectoryLoader(source_pdfs_path, glob="./*.pdf", show_progress=True)
18
  documents = loader.load()
19
  return documents
20
 
 
3
  from timeit import default_timer as timer
4
  from typing import List
5
 
6
+ from langchain.document_loaders import PyPDFDirectoryLoader
 
 
7
  from langchain.embeddings import HuggingFaceInstructEmbeddings
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.vectorstores.chroma import Chroma
 
12
 
13
 
14
  def load_documents(source_pdfs_path) -> List:
15
+ loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
16
  documents = loader.load()
17
  return documents
18