v2.1 added RAG summary by group
Files changed:
- .DS_Store +0 -0
- .gitignore +1 -0
- app.py +29 -29
- appStore/__pycache__/__init__.cpython-310.pyc +0 -0
- appStore/__pycache__/doc_processing.cpython-310.pyc +0 -0
- appStore/__pycache__/rag.cpython-310.pyc +0 -0
- appStore/__pycache__/target.cpython-310.pyc +0 -0
- appStore/__pycache__/vulnerability_analysis.cpython-310.pyc +0 -0
- appStore/rag.py +86 -0
- appStore/target.py +35 -1
- requirements.txt +4 -1
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/target_classifier.cpython-310.pyc +0 -0
- utils/__pycache__/uploadAndExample.cpython-310.pyc +0 -0
- utils/__pycache__/vulnerability_classifier.cpython-310.pyc +0 -0
.DS_Store
ADDED (binary file, 8.2 kB)
.gitignore
ADDED
@@ -0,0 +1 @@
+civ_v2/
app.py
CHANGED
@@ -2,36 +2,36 @@ import streamlit as st
 import os
 import pkg_resources
 
-# Using this wacky hack to get around the massively ridiculous managed env loading order
-def is_installed(package_name, version):
-    try:
-        pkg = pkg_resources.get_distribution(package_name)
-        return pkg.version == version
-    except pkg_resources.DistributionNotFound:
-        return False
-
-# shifted from below - this must be the first streamlit call; otherwise: problems
-st.set_page_config(page_title = 'Vulnerability Analysis',
-                   initial_sidebar_state='expanded', layout="wide")
-
-@st.cache_resource # cache the function so it's not called every time app.py is triggered
-def install_packages():
-    install_commands = []
-
-    if not is_installed("spaces", "0.12.0"):
-        install_commands.append("pip install spaces==0.17.0")
-    if not is_installed("pydantic", "1.8.2"):
-        install_commands.append("pip install pydantic==1.8.2")
-    if not is_installed("typer", "0.4.0"):
-        install_commands.append("pip install typer==0.4.0")
-
-    if install_commands:
-        os.system(" && ".join(install_commands))
-
-# install packages if necessary
-install_packages()
+# # Using this wacky hack to get around the massively ridiculous managed env loading order
+# def is_installed(package_name, version):
+#     try:
+#         pkg = pkg_resources.get_distribution(package_name)
+#         return pkg.version == version
+#     except pkg_resources.DistributionNotFound:
+#         return False
+
+# # shifted from below - this must be the first streamlit call; otherwise: problems
+# st.set_page_config(page_title = 'Vulnerability Analysis',
+#                    initial_sidebar_state='expanded', layout="wide")
+
+# @st.cache_resource # cache the function so it's not called every time app.py is triggered
+# def install_packages():
+#     install_commands = []
+
+#     if not is_installed("spaces", "0.12.0"):
+#         install_commands.append("pip install spaces==0.17.0")
+#     if not is_installed("pydantic", "1.8.2"):
+#         install_commands.append("pip install pydantic==1.8.2")
+#     if not is_installed("typer", "0.4.0"):
+#         install_commands.append("pip install typer==0.4.0")
+
+#     if install_commands:
+#         os.system(" && ".join(install_commands))
+
+# # install packages if necessary
+# install_packages()
 
 import appStore.vulnerability_analysis as vulnerability_analysis
 import appStore.target as target_analysis
@@ -41,8 +41,8 @@ from utils.vulnerability_classifier import label_dict
 import pandas as pd
 import plotly.express as px
 
-
-
+st.set_page_config(page_title = 'Vulnerability Analysis',
+                   initial_sidebar_state='expanded', layout="wide")
 
 with st.sidebar:
     # upload and example doc
@@ -54,7 +54,7 @@ with st.sidebar:
     add_upload(choice)
 
 with st.container():
-    st.markdown("<h2 style='text-align: center;
+    st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis 2.0 </h2>", unsafe_allow_html=True)
     st.write(' ')
 
 with st.expander("ℹ️ - About this app", expanded=False):
appStore/__pycache__/__init__.cpython-310.pyc
ADDED (binary file, 154 Bytes)

appStore/__pycache__/doc_processing.cpython-310.pyc
ADDED (binary file, 3.18 kB)

appStore/__pycache__/rag.cpython-310.pyc
ADDED (binary file, 1.81 kB)

appStore/__pycache__/target.cpython-310.pyc
ADDED (binary file, 2.8 kB)

appStore/__pycache__/vulnerability_analysis.cpython-310.pyc
ADDED (binary file, 4.78 kB)
appStore/rag.py
ADDED
@@ -0,0 +1,86 @@
+import os
+# import json
+import numpy as np
+import pandas as pd
+import openai
+from haystack.schema import Document
+import streamlit as st
+from tenacity import retry, stop_after_attempt, wait_random_exponential
+
+
+# Get openai API key
+openai.api_key = os.environ["OPENAI_API_KEY"]
+model_select = "gpt-3.5-turbo-1106"
+
+
+# define a special function for putting the prompt together (as we can't use haystack)
+def get_prompt(context):
+    base_prompt="Summarize the following context efficiently in bullet points, the less the better. \
+    Summarize only activities that address the vulnerability of the given context to climate change. \
+    Formatting example: \
+    - Collect and utilize gender-disaggregated data to inform and improve climate change adaptation efforts. \
+    - Prioritize gender sensitivity in adaptation options, ensuring participation and benefits for women, who are more vulnerable to climate impacts. \
+    "
+
+    # Add the meta data for references
+    # context = ' - '.join([d.content for d in docs])
+    prompt = base_prompt+"; Context: "+context+"; Answer:"
+
+    return prompt
+
+
+# # convert df rows to Document object so we can feed it into the summarizer easily
+# def get_document(df):
+#     # we take a list of each extract
+#     ls_dict = []
+#     for index, row in df.iterrows():
+#         # Create a Document object for each row (we only need the text)
+#         doc = Document(
+#             row['text'],
+#             meta={
+#                 'label': row['Vulnerability Label']}
+#         )
+#         # Append the Document object to the documents list
+#         ls_dict.append(doc)
+
+#     return ls_dict
+
+
+# exception handling for issuing multiple API calls to openai (exponential backoff)
+@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
+def completion_with_backoff(**kwargs):
+    return openai.ChatCompletion.create(**kwargs)
+
+
+# construct RAG query, send to openai and process response
+def run_query(df):
+    docs = df
+
+    '''
+    For non-streamed completion, enable the following 2 lines and comment out the code below
+    '''
+    # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
+    # result = res.choices[0].message.content
+
+    # instantiate ChatCompletion as a generator object (stream is set to True)
+    response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}], stream=True)
+    # iterate through the streamed output
+    report = []
+    res_box = st.empty()
+    for chunk in response:
+        # extract the object containing the text (totally different structure when streaming)
+        chunk_message = chunk['choices'][0]['delta']
+        # test to make sure there is text in the object (some don't have)
+        if 'content' in chunk_message:
+            report.append(chunk_message.content)  # extract the message
+            # add the latest text and merge it with all previous
+            result = "".join(report).strip()
+            # res_box.success(result)  # output to response text box
+            res_box.success(result)
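For context, a minimal usage sketch of run_query, not part of the commit: the extract strings and the "Gender" heading below are made up, and it assumes OPENAI_API_KEY is set and the code runs inside a Streamlit app. run_query expects a single pre-concatenated text string and streams the bullet-point summary into an st.empty() placeholder.

# Hypothetical illustration only; sample strings are invented.
import streamlit as st
from appStore.rag import run_query

extracts = [
    "Collect gender-disaggregated data for adaptation monitoring.",
    "Expand drought-resilient seed varieties for smallholder farmers.",
]

st.write("Gender")              # label heading, mirroring how target.py labels each group
run_query("; ".join(extracts))  # streams the summary into an st.empty() box as chunks arrive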
appStore/target.py
CHANGED
@@ -17,6 +17,7 @@ from io import BytesIO
 import xlsxwriter
 import plotly.express as px
 from utils.target_classifier import label_dict
+from appStore.rag import run_query
 
 # Declare all the necessary variables
 classifier_identifier = 'target'
@@ -82,7 +83,40 @@ def app():
 
 def target_display():
 
+    ### TABLE Output ###
+
     # Assign dataframe a name
     df = st.session_state['key2']
-
     st.write(df)
+
+    ### RAG Output by group ##
+
+    # Expand the DataFrame
+    df_expand = df.explode('Vulnerability Label')
+    # Group by 'Vulnerability Label' and concatenate 'text'
+    df_agg = df_expand.groupby('Vulnerability Label')['text'].agg('; '.join).reset_index()
+
+    st.markdown("----")
+    st.markdown('**DOCUMENT FINDINGS SUMMARY BY VULNERABILITY LABEL:**')
+
+    # construct RAG query for each label, send to openai and process response
+    for i in range(0,len(df_agg)):
+        st.write(df_agg['Vulnerability Label'].iloc[i])
+        run_query(df_agg['text'].iloc[i])
+        # st.write(df_agg['text'].iloc[i])
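For reference, a small sketch of what the explode/groupby step in target_display() produces, assuming each row's 'Vulnerability Label' holds a list of labels (which is what explode implies); the two rows and label names below are invented for illustration, while the real rows come from the classifier output stored in st.session_state.

import pandas as pd

# Hypothetical classifier output: one extract per row, possibly several labels per extract.
df = pd.DataFrame({
    "text": ["Extract A", "Extract B"],
    "Vulnerability Label": [["Gender", "Age"], ["Gender"]],
})

df_expand = df.explode("Vulnerability Label")   # one row per (extract, label) pair
df_agg = df_expand.groupby("Vulnerability Label")["text"].agg("; ".join).reset_index()
print(df_agg)
#   Vulnerability Label                  text
# 0                 Age             Extract A
# 1              Gender  Extract A; Extract B

Each aggregated text blob is then passed to run_query, so the app renders one streamed summary per vulnerability label group.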
requirements.txt
CHANGED
@@ -19,4 +19,7 @@ altair==4.0
 streamlit-aggrid
 python-docx
 setfit
-plotly.express
+plotly.express
+openai==0.27.9
+pydantic==1.8.2
+scikit-learn==1.0.2
utils/__pycache__/__init__.cpython-310.pyc
ADDED (binary file, 151 Bytes)

utils/__pycache__/config.cpython-310.pyc
ADDED (binary file, 1.1 kB)

utils/__pycache__/preprocessing.cpython-310.pyc
ADDED (binary file, 9.07 kB)

utils/__pycache__/target_classifier.cpython-310.pyc
ADDED (binary file, 3.6 kB)

utils/__pycache__/uploadAndExample.cpython-310.pyc
ADDED (binary file, 1.22 kB)

utils/__pycache__/vulnerability_classifier.cpython-310.pyc
ADDED (binary file, 4.39 kB)