Spaces:
No application file
No application file
File size: 9,323 Bytes
d137e33 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import re
# import string
import streamlit as st
import logging
from config import Settings
from helper import load_corpus, load_searchers, filter_corpus, filter_years
st.set_page_config(page_title="Variable Search", page_icon="π")
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
sidebar_description = """
**This website saves the text that you write in the search input field. This data is used to improve the search engine.**
__Info__:
This site allows you to search for survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.
Using language models, the input is semantically compared against the question text of survey items.
Ideally, the results should contain the same or similar semantic information as your input.
You can filter the search results by year or country of the survey.
__How to use__:
1. Select a pre-defined input or enter a search query in the search input field.
2. Select one or more countries from the list under "Geography" or leave it empty.
3. Select a study group or leave it empty.
4. Select the range of years to be included in the search results.
5. Click on the search button.
6. Evaluate the results by clicking on "Show X more survey item(s)" to expand the list of results that contain an identical question.
__NOTE__:
__Longitudal Studies__: Variables that are grouped together often originate from longitudal studies (i.e., repeated measures over long periods of time).
__Concept Search__: While the search system will perform best for finding texts that are semantically-similar to the input, you may also try more abstract inputs such as concepts (e.g., "financial literacy"). While the system is not specifically designed to retrieve variables that are related to concepts, language models (having seen a large part of the internet) may map concepts to texts similar to the variables that are used to operationalize and measure them.
"""
st.sidebar.markdown(sidebar_description)
st.title("Variable Search")
st.markdown(f"Search across survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.<br>In total, you can search for over 80,000 items.", unsafe_allow_html=True)
@st.cache_data
def prepare_data(corpus_path, langs, pattern):
return load_corpus(corpus_path, langs, pattern)
@st.cache_resource
def prepare_models(index_name, model_name_or_path):
return load_searchers(index_name, model_name_or_path)
def prepare(settings, langs, pattern):
langs = sorted(settings.languages.split(','))
# logging.info("Preparing data...")
df = prepare_data(settings.corpus_path, langs, pattern)
# logging.info("Done.")
# logging.info("Preparing models...")
hsearcher = prepare_models(settings.index_name, settings.model_name_or_path)
# logging.info("Done.")
return df, hsearcher
try:
st.info("Please note, the **text** that you write into the text box **is saved** to improve the search engine.")
settings = Settings()
query = st.selectbox("Pre-defined inputs:", ["Another query (via 'Search input')..."]+settings.predefined_inputs, key="pre-query")
# query = st.selectbox("Pre-defined inputs:", ["Another query (via 'Search input')...", "Do you have a job?", "Are you happy with the healthcare system?"])
col1, col2 = st.columns([9,1])
if query == "Another query (via 'Search input')...":
with col1:
# query = st.selectbox("Pre-defined Queries:", ["Other query", "Do you have a job?", "Are you happy with the healthcare system?"])
query = st.text_input(label="Search input:", placeholder="Do you have a job?", key="query")
with col2:
st.write('#')
button_clicked = st.button("π")
else:
button_clicked = False
langs = sorted(settings.languages.split(','))
pattern = re.compile(r'[\W_]+')
corpus_df, hsearcher = prepare(settings, langs, pattern)
all_countries = sorted(list(set([c for cs in corpus_df["countries"].tolist() for c in cs if c and "_" not in c])))
countries = st.multiselect("Geography:", all_countries, key="countries")
if countries:
corpus_df = filter_corpus(corpus_df, countries, column="countries")
all_study_groups = sorted(list(set(corpus_df["title"].tolist())))
study_groups = st.multiselect("Study Group:", all_study_groups, key="study_groups")
if study_groups:
corpus_df = filter_corpus(corpus_df, study_groups, column="title", row_type=str)
unique_years = list(set([int(x) for x in corpus_df["date"].tolist() if isinstance(x, str) or isinstance(x, int)]))
if unique_years:
min_year, max_year = min(unique_years), max(unique_years)
if min_year < max_year:
year = st.slider("Publication Year:", min_year, max_year, (min_year, max_year), 1, key="year")
corpus_df = filter_years(corpus_df, year)
else:
year = min_year
st.markdown(f"Year: {min_year}")
st.markdown("---")
else:
year = None
corpus_groups = corpus_df.groupby(by='alpha_sentence')
try:
if (query or button_clicked) and query != "":
logging.info(f"Query: '{query}'")
logging.info(f"Geography: {countries}")
logging.info(f"Min/max Years: {year}")
with st.spinner("Searching..."):
hits = hsearcher.search(query, alpha=settings.alpha, k0=settings.top_k, k=settings.top_k, normalization=settings.normalization, weight_on_dense=settings.weight_on_dense)
result_sentences = []
for hit in hits:
_id = hit.docid
if _id in corpus_df.index:
result_sentence = corpus_df.loc[_id]["sentence"]
result_sentence = re.sub(pattern, '', result_sentence).lower()
if result_sentence not in result_sentences:
result_sentences.append(result_sentence)
st.write(f"<i>Showing the top {len(result_sentences)} result(s) out of {len(corpus_groups.groups)} question(s).</i>", unsafe_allow_html=True)
st.write("---")
# ogroups = sorted(corpus_groups.groups.items(), key=lambda x: x[1][0])
for j,sentence in enumerate(result_sentences):
if sentence in corpus_groups.groups:
group = corpus_groups.get_group(sentence)
osentence = group.iloc[0].get('sentence', '')
st.markdown(f'Question: {osentence}', unsafe_allow_html=True)
expander_text = f'Show {group.shape[0]} grouped survey items.' if group.shape[0] > 1 else f'Show {group.shape[0]} grouped survey item.'
modal = st.expander(expander_text)
for i in range(group.shape[0]):
row = group.iloc[i]
rid = row.get('id', '')
rlabel = row.get('label', '')
rsq = row.get('sub-question', '')
ritem = row.get('item_category', '')
rtitle = row.get('title', '')
if rtitle and rid:
rtitle = f'<a href="https://search.gesis.org/research_data/{rid.split("_")[0]}">{rtitle}</a>'
rdate = row.get('date', '') # TODO: what is this date?
rcountries = row.get('countries', '')
rqt1 = row.get('question_type1', '')
rqt2 = row.get('question_type2', '')
modal.markdown(f'<a href="https://search.gesis.org/variables/exploredata-{rid}">{rid}</a>\
<br>Label: {rlabel}\
<br>Sub-Question: {rsq}\
<br>Item: {ritem}\
<br>Research Data: {rtitle}\
<br>Study Date: {rdate}\
<br>Geography: {rcountries}\
<br>Question Type 1: {rqt1}\
<br>Question Type 2: {rqt2}',
unsafe_allow_html=True
)
if i+1 < group.shape[0] > 1:
modal.markdown('---')
if j+1 < len(result_sentences) > 1:
st.markdown('---')
else:
logging.debug(f"Sentence is not in groups: {sentence}")
except:
st.error("Something went wrong. Please try again with a different input.")
logging.warning(f'An error occurred for the query: {query}')
except:
st.error("Something went wrong. Please try again later.")
logging.warning(f'The app crashed.')
|