Commit 98639ab
Parent(s): 3fc85ef
heikowagner committed

move elements

Files changed:
- .dockerignore        +12  -0
- Dockerfile            +4  -2
- app/app.py            +3  -1
- app/elements.py      +67  -0
- app/load_model.py     +2  -2
- app/load_vectors.py  +18  -1
- app/run.py           +16  -5
- app/utils.py          +2 -59
- docker-compose.yaml   +4  -1
.dockerignore ADDED
@@ -0,0 +1,12 @@
+./docker/zeppelin/logs/*
+*.openaiapikey*
+*.log
+*.log.*
+*__pycache__*
+root
+*.ipynb_checkpoints*
+.vscode
+/app/mymodels
+/app/.cache
+/app/VectorStore
+*chroma-embeddings.parquet*
Dockerfile CHANGED
@@ -30,8 +30,9 @@ COPY ./requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 COPY ./app .
 #RUN python load_docs.py
-RUN --mount=type=secret,id=OPENAI_API_KEY \
-    cat /run/secrets/OPENAI_API_KEY > .openaiapikey
+#RUN --mount=type=secret,id=OPENAI_API_KEY \
+#    cat /run/secrets/OPENAI_API_KEY > .openaiapikey
+RUN echo "" > .openaiapikey
 RUN mkdir /.cache
 RUN mkdir /nltk_data
 RUN mkdir /VectorStore
@@ -39,6 +40,7 @@ RUN mkdir /app/.cache
 RUN ls -la
 RUN python run.py
 RUN chmod 777 /VectorStore
+RUN chmod 777 /mymodels
 RUN chmod 777 /nltk_data
 RUN chmod 777 /.cache
 RUN chmod 777 /app/.cache
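With this change the image no longer bakes the OpenAI key in at build time: the BuildKit secret mount is commented out and an empty .openaiapikey is written instead, so the key has to be supplied at runtime. A minimal sketch of a runtime fallback, assuming the app keeps reading the file the way run.py does; the OPENAI_API_KEY environment-variable fallback is an illustration, not part of this commit:

import os
import pathlib

# Read the key file that the Dockerfile now creates empty.
key_file = pathlib.Path(__file__).parent / '.openaiapikey'
key = key_file.read_text().strip() if key_file.exists() else ''

# Hypothetical fallback: take the key from the environment when the file is blank.
if not key:
    key = os.environ.get('OPENAI_API_KEY', '')
os.environ['OPENAI_API_KEY'] = key  # where langchain's OpenAI wrapper looks for it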
app/app.py CHANGED
@@ -1,7 +1,9 @@
 import streamlit as st
 import load_model
 import utils as ut
+import elements as el
 import os
+import torch
 
 persist_directory = load_model.persist_directory
 st.title('myRetrievalGPT')
@@ -11,7 +13,7 @@ st.markdown('*Let $\phi$ be a word embedding mapping $W$ → $\mathbb{R}^n$ w
 
 agree = st.checkbox('Load new Documents')
 if agree:
-    ut.load_files()
+    el.load_files()
 else:
 
     import torch
app/elements.py ADDED
@@ -0,0 +1,67 @@
+
+import streamlit as st
+from langchain.docstore.document import Document
+from chromadb.config import Settings
+from load_model import load_embedding
+from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
+from utils import retrieve_collections, get_chroma_client
+
+def llm_module():
+    pass
+
+def load_files():
+
+    client = get_chroma_client()
+
+    option = st.radio(
+        "",
+        options=["Add Documents", "Start new collection"],
+    )
+
+    if option == "Add Documents":
+        collections = retrieve_collections()
+        selected_collection = st.selectbox(
+            'Add to existing collection or create a new one',
+            collections )
+        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
+            client.delete_collection(name=selected_collection["name"])
+            #retrieve_collections.clear()
+            collections = retrieve_collections()
+
+        if selected_collection:
+            st.write("Selected Vectorstore:", selected_collection)
+            option = st.radio(
+                "",
+                options=["Upload Files from Local", "Upload Files from Web"],
+            )
+            if option == "Upload Files from Local":
+                st.write('Source Documents:')
+                uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
+                chunk_size = st.text_area('chunk Size:', 1000)
+
+                if st.button('Upload'):
+                    docs = load_from_file(uploaded_files)
+                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
+                    vec1 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
+                    st.write("Upload successful")
+            else:
+                st.write('Urls of Source Documents (Comma separated):')
+                urls = st.text_area('Urls:', '')
+                chunk_size = st.text_area('chunk Size:', 1000)
+                urls = urls.replace(" ", "").replace('"', "").split(',')
+
+                if st.button('Upload'):
+                    docs = load_from_web(urls)
+                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
+                    vec2 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
+                    st.write("Upload successful")
+    else:
+        collection = st.text_area('Name of your new collection:', '')
+        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
+        if st.button('Create'):
+            if len(collection)>3:
+                ef = load_embedding(model_name)
+                metadata= {"loaded_docs":[], "Subject":"Terms Example", "model_name": ef.model_name}
+                client.create_collection(collection, embedding_function=ef, metadata=metadata)
+                # retrieve_collections.clear()
+                st.write("Collection " + collection + " successfully created.")
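load_files() leans on the records that utils.retrieve_collections() returns: the selectbox hands back one dict per chroma collection, which is why the code indexes selected_collection["name"] and selected_collection['model_name']. A sketch of that record shape, with illustrative values taken from run.py in this commit:

selected_collection = {
    'name': 'papers',                         # chroma collection name
    'model_name': 'hkunlp/instructor-large',  # embedding model recorded at creation time
    'metadata': {
        'loaded_docs': [],
        'Subject': 'Heikos Papers',
        'model_name': 'hkunlp/instructor-large',
    },
}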
app/load_model.py CHANGED
@@ -82,8 +82,8 @@ def load_gpu_model(used_model):
     return llm
 
 #@st.cache_resource
-def load_openai_model():
-    return OpenAI(temperature=0.9)
+def load_openai_model(temperature=0.9):
+    return OpenAI(temperature=temperature)
 
 @st.cache_resource
 def load_openai_embedding():
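The refactor turns the hard-coded sampling temperature into a parameter with the old value as default, so existing callers keep their behaviour. Usage sketch, assuming langchain's OpenAI wrapper behind the function:

from load_model import load_openai_model

llm = load_openai_model()                         # temperature=0.9, as before
focused_llm = load_openai_model(temperature=0.1)  # lower temperature for more deterministic answers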
app/load_vectors.py CHANGED
@@ -101,4 +101,21 @@ def load_from_web(urls, cache=True):
 def load_and_split(docs, chunk_size=700):
     text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
     sub_docs = text_splitter.split_documents(docs)
-    return sub_docs
+    return sub_docs
+
+def metadata_generator(doc, llm, max_token=4000):
+    #query = f"Document = {doc.page_content[1:max_token]} -> Respond a python code using a dict filling xxxx like {{'document_type': xxxx, 'summary (max. 30 letters)':'xxxx'}} resond at leat 10 letter"
+    query = f"""
+    Cluster the following Input document into topic categories based on patterns seen within the text. Also mention reasoning behind how these categories were defined.
+    Output format:
+    {{
+    "DOCUMENT TYPE": "",
+    "SUMMARY": [],
+    "REASONING": ""
+    }}
+
+    Input document:
+    {doc.page_content[1:max_token]}
+    Output:
+    """
+    return llm(query)
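The new metadata_generator feeds the first max_token characters of a document into the LLM and asks for a document type, summary, and reasoning in a JSON-like shape; it returns the raw completion, so parsing is left to the caller. A minimal driving sketch, mirroring the commented-out experiment in run.py (the page_content here is illustrative):

from langchain.docstore.document import Document
import load_model
from load_vectors import metadata_generator

llm = load_model.load_openai_model(temperature=0.1)
doc = Document(page_content="General terms and conditions for household insurance ...", metadata={})

# Prints the raw LLM completion following the prompt's output format.
print(metadata_generator(doc, llm))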
app/run.py CHANGED
@@ -5,7 +5,7 @@ import pathlib
 
 from load_model import load_embedding
 from utils import get_chroma_client
-from load_vectors import load_from_web, create_and_add, load_and_split
+from load_vectors import load_from_web, create_and_add, load_and_split, metadata_generator
 
 current_path = str( pathlib.Path(__file__).parent.resolve() )
 with open(current_path+'/.openaiapikey', 'r') as reader:
@@ -21,12 +21,12 @@ llm= load_model.load_openai_model()
 client = get_chroma_client()
 client.reset()
 ef = load_embedding("hkunlp/instructor-large")
-collection_name="
-metadata= {"loaded_docs":[], "Subject":"
+collection_name="papers"
+metadata= {"loaded_docs":[], "Subject":"Heikos Papers", "model_name": ef.model_name}
 selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
 
 docs_tarifs= [
-    "https://
+    "https://edoc.hu-berlin.de/bitstream/handle/18452/5294/33.pdf",
 ]
 
 # %%
@@ -40,7 +40,18 @@ docs = load_from_web(docs_tarifs)
 sub_docs = load_and_split(docs, chunk_size=1000)
 create_and_add(collection_name, sub_docs, model_name, metadata)
 
+
+
 # %%
-chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
+#chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name, metadata=metadata)
 #result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
 #print(result)
+#llm= load_model.load_openai_model(temperature=0.1)
+
+#llm= load_model.load_cpu_model()
+
+#meta= metadata_generator(docs[0], llm)
+# %%
+#print(meta)
+
+# %%
app/utils.py CHANGED
@@ -27,66 +27,9 @@ def get_chroma_client():
     return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                     persist_directory=persist_directory
                 ))
-
+@st.cache_data
 def retrieve_collections():
     client = get_chroma_client()
    all_collections = client.list_collections()
     collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name'], "metadata": collection.metadata} for collection in all_collections] )
-    return collections
-
-def load_files():
-
-    client = get_chroma_client()
-
-    option = st.radio(
-        "",
-        options=["Add Documents", "Start new collection"],
-    )
-
-    if option == "Add Documents":
-        collections = retrieve_collections()
-        selected_collection = st.selectbox(
-            'Add to exsisting collection or create a new one',
-            collections )
-        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
-            client.delete_collection(name=selected_collection["name"])
-            #retrieve_collections.clear()
-            collections = retrieve_collections()
-
-        if selected_collection:
-            st.write("Selected Vectorstore:", selected_collection)
-            option = st.radio(
-                "",
-                options=["Upload Files from Local", "Upload Files from Web"],
-            )
-            if option == "Upload Files from Local":
-                st.write('Source Documents:')
-                uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
-                chunk_size = st.text_area('chunk Size:', 1000)
-
-                if st.button('Upload'):
-                    docs = load_from_file(uploaded_files)
-                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-                    vec1 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
-                    st.write("Upload succesful")
-            else:
-                st.write('Urls of Source Documents (Comma separated):')
-                urls = chunk_size = st.text_area('Urls:', '')
-                chunk_size = st.text_area('chunk Size:', 1000)
-                urls = urls.replace(",", "" ).replace('"', "" ).split(',')
-
-                if st.button('Upload'):
-                    docs = load_from_web(urls)
-                    sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-                    vec2 = create_and_add(selected_collection["name"], sub_docs, selected_collection['model_name'], selected_collection['metadata'])
-                    st.write("Upload succesful")
-    else:
-        collection = st.text_area('Name of your new collection:', '')
-        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
-        if st.button('Create'):
-            if len(collection)>3:
-                ef = load_embedding(model_name)
-                metadata= {"loaded_docs":[], "Subject":"Terms Example", "model_name": ef.model_name}
-                client.create_collection(collection, embedding_function=ef, metadata=metadata)
-                # retrieve_collections.clear()
-                st.write("Collection " +collection+" succesfully created.")
+    return collections
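retrieve_collections() is now wrapped in st.cache_data, so the collection list is computed once and reused across Streamlit reruns. This is also what the commented-out retrieve_collections.clear() calls in elements.py point at: Streamlit attaches a .clear() method to cached functions, so the invalidation pattern would look like this sketch:

# After deleting or creating a collection, drop the cached list and re-query chroma.
retrieve_collections.clear()
collections = retrieve_collections()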
docker-compose.yaml CHANGED
@@ -1,6 +1,7 @@
 version: "3.9"
 services:
   streamlit_app:
+    image: myretrievalgpt
     build: .
     tty: true
     ports:
@@ -11,11 +12,13 @@ services:
         devices:
           - capabilities: [gpu]
   dev_app:
-    image:
+    image: myretrievalgpt
     tty: true
     volumes:
      - ./app:/app
       - ./root:/root
+    depends_on:
+      - streamlit_app
     deploy:
       resources:
         reservations: