e-tornike commited on
Commit
d137e33
1 Parent(s): e86164b

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ index filter=lfs diff=lfs merge=lfs -text
37
+ *.tsv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re
import logging

import streamlit as st

from config import Settings
from helper import load_corpus, load_searchers, filter_corpus, filter_years

# Browser-tab title and icon.
st.set_page_config(page_title="Variable Search", page_icon="🔎")

# Hide Streamlit's default hamburger menu and footer.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)


# Sidebar: privacy notice, feature description, and usage instructions.
# (Fixed typo: "Longitudal" -> "Longitudinal".)
sidebar_description = """
**This website saves the text that you write in the search input field. This data is used to improve the search engine.**

__Info__:

This site allows you to search for survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.
Using language models, the input is semantically compared against the question text of survey items.
Ideally, the results should contain the same or similar semantic information as your input.
You can filter the search results by year or country of the survey.

__How to use__:

1. Select a pre-defined input or enter a search query in the search input field.
2. Select one or more countries from the list under "Geography" or leave it empty.
3. Select a study group or leave it empty.
4. Select the range of years to be included in the search results.
5. Click on the search button.
6. Evaluate the results by clicking on "Show X more survey item(s)" to expand the list of results that contain an identical question.

__NOTE__:

__Longitudinal Studies__: Variables that are grouped together often originate from longitudinal studies (i.e., repeated measures over long periods of time).

__Concept Search__: While the search system will perform best for finding texts that are semantically-similar to the input, you may also try more abstract inputs such as concepts (e.g., "financial literacy"). While the system is not specifically designed to retrieve variables that are related to concepts, language models (having seen a large part of the internet) may map concepts to texts similar to the variables that are used to operationalize and measure them.
"""
st.sidebar.markdown(sidebar_description)


st.title("Variable Search")
# Plain string literal: the former f-string carried no placeholders (ruff F541).
st.markdown("Search across survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.<br>In total, you can search for over 80,000 items.", unsafe_allow_html=True)
@st.cache_data
def prepare_data(corpus_path, langs, pattern):
    """Load the survey-item corpus; cached across Streamlit reruns."""
    corpus = load_corpus(corpus_path, langs, pattern)
    return corpus
@st.cache_resource
def prepare_models(index_name, model_name_or_path):
    """Build the hybrid searcher once; cached as a shared resource."""
    searcher = load_searchers(index_name, model_name_or_path)
    return searcher
def prepare(settings, langs, pattern):
    """Load the corpus dataframe and the hybrid searcher.

    Args:
        settings: app ``Settings`` with corpus path, index name, and model.
        langs: sorted list of language codes to keep in the corpus.
        pattern: compiled regex used to normalize sentences.

    Returns:
        Tuple of (corpus dataframe, hybrid searcher).
    """
    # Fix: `langs` used to be recomputed from settings here, silently
    # shadowing and ignoring the argument; the caller already passes the
    # identical value, so dropping the reassignment preserves behavior.
    df = prepare_data(settings.corpus_path, langs, pattern)
    hsearcher = prepare_models(settings.index_name, settings.model_name_or_path)
    return df, hsearcher
# Main page: search form, filters, and result rendering.
try:
    st.info("Please note, the **text** that you write into the text box **is saved** to improve the search engine.")

    settings = Settings()

    # Pre-defined inputs plus a sentinel entry that switches to free-text input.
    query = st.selectbox("Pre-defined inputs:", ["Another query (via 'Search input')..."]+settings.predefined_inputs, key="pre-query")

    col1, col2 = st.columns([9,1])

    if query == "Another query (via 'Search input')...":
        with col1:
            query = st.text_input(label="Search input:", placeholder="Do you have a job?", key="query")

        with col2:
            st.write('#')  # vertical spacer so the button lines up with the input
            button_clicked = st.button("🔎")
    else:
        button_clicked = False

    langs = sorted(settings.languages.split(','))
    pattern = re.compile(r'[\W_]+')  # strips every non-alphanumeric character

    corpus_df, hsearcher = prepare(settings, langs, pattern)

    # Country filter; placeholder codes containing "_" are excluded.
    all_countries = sorted(list(set([c for cs in corpus_df["countries"].tolist() for c in cs if c and "_" not in c])))
    countries = st.multiselect("Geography:", all_countries, key="countries")
    if countries:
        corpus_df = filter_corpus(corpus_df, countries, column="countries")

    all_study_groups = sorted(list(set(corpus_df["title"].tolist())))
    study_groups = st.multiselect("Study Group:", all_study_groups, key="study_groups")
    if study_groups:
        corpus_df = filter_corpus(corpus_df, study_groups, column="title", row_type=str)

    # Year range slider; collapses to a static label when only one year remains.
    unique_years = list(set([int(x) for x in corpus_df["date"].tolist() if isinstance(x, str) or isinstance(x, int)]))
    if unique_years:
        min_year, max_year = min(unique_years), max(unique_years)
        if min_year < max_year:
            year = st.slider("Publication Year:", min_year, max_year, (min_year, max_year), 1, key="year")
            corpus_df = filter_years(corpus_df, year)
        else:
            year = min_year
            st.markdown(f"Year: {min_year}")
            st.markdown("---")
    else:
        year = None

    # Identical questions are grouped by their normalized (alphanumeric-only) text.
    corpus_groups = corpus_df.groupby(by='alpha_sentence')

    try:
        if (query or button_clicked) and query != "":
            logging.info(f"Query: '{query}'")
            logging.info(f"Geography: {countries}")
            logging.info(f"Min/max Years: {year}")

            with st.spinner("Searching..."):
                hits = hsearcher.search(query, alpha=settings.alpha, k0=settings.top_k, k=settings.top_k, normalization=settings.normalization, weight_on_dense=settings.weight_on_dense)
                # Deduplicate hits on their normalized sentence, preserving rank order.
                result_sentences = []
                for hit in hits:
                    _id = hit.docid
                    if _id in corpus_df.index:
                        result_sentence = corpus_df.loc[_id]["sentence"]
                        result_sentence = re.sub(pattern, '', result_sentence).lower()
                        if result_sentence not in result_sentences:
                            result_sentences.append(result_sentence)

                st.write(f"<i>Showing the top {len(result_sentences)} result(s) out of {len(corpus_groups.groups)} question(s).</i>", unsafe_allow_html=True)
                st.write("---")

                for j,sentence in enumerate(result_sentences):
                    if sentence in corpus_groups.groups:
                        group = corpus_groups.get_group(sentence)
                        osentence = group.iloc[0].get('sentence', '')

                        st.markdown(f'Question: {osentence}', unsafe_allow_html=True)
                        expander_text = f'Show {group.shape[0]} grouped survey items.' if group.shape[0] > 1 else f'Show {group.shape[0]} grouped survey item.'
                        modal = st.expander(expander_text)
                        for i in range(group.shape[0]):
                            row = group.iloc[i]
                            rid = row.get('id', '')
                            rlabel = row.get('label', '')
                            rsq = row.get('sub-question', '')
                            ritem = row.get('item_category', '')
                            rtitle = row.get('title', '')
                            if rtitle and rid:
                                rtitle = f'<a href="https://search.gesis.org/research_data/{rid.split("_")[0]}">{rtitle}</a>'
                            rdate = row.get('date', '')  # TODO: what is this date?
                            rcountries = row.get('countries', '')
                            rqt1 = row.get('question_type1', '')
                            rqt2 = row.get('question_type2', '')

                            modal.markdown(f'<a href="https://search.gesis.org/variables/exploredata-{rid}">{rid}</a>\
                                <br>Label: {rlabel}\
                                <br>Sub-Question: {rsq}\
                                <br>Item: {ritem}\
                                <br>Research Data: {rtitle}\
                                <br>Study Date: {rdate}\
                                <br>Geography: {rcountries}\
                                <br>Question Type 1: {rqt1}\
                                <br>Question Type 2: {rqt2}',
                                unsafe_allow_html=True
                            )
                            # Divider between grouped items (chained comparison:
                            # i+1 < size AND size > 1).
                            if i+1 < group.shape[0] > 1:
                                modal.markdown('---')

                        if j+1 < len(result_sentences) > 1:
                            st.markdown('---')
                    else:
                        logging.debug(f"Sentence is not in groups: {sentence}")
    except Exception:
        # Fix: was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
        # logging.exception records the traceback for debugging.
        st.error("Something went wrong. Please try again with a different input.")
        logging.exception(f'An error occurred for the query: {query}')
except Exception:
    # Fix: was a bare `except:`; keep the user-facing message, log the traceback.
    st.error("Something went wrong. Please try again later.")
    logging.exception('The app crashed.')
config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class Settings:
    """Static configuration for the variable-search app.

    Holds paths, index/model identifiers, search hyper-parameters, and the
    pre-defined example inputs shown in the UI.  All values are class-level
    constants; annotations added consistently (previously only
    ``model_name_or_path`` was annotated).
    """

    root_dir: str = './data/embeddings/'
    corpus_path: str = './data/full.tsv'
    model_name_or_path: str = 'intfloat/e5-small'
    index_name: str = 'test-index'
    languages: str = 'en'          # comma-separated language codes
    top_k: int = 100               # number of hits requested from each searcher
    alpha: float = 2.0             # sparse/dense interpolation weight
    normalization: bool = True
    weight_on_dense: bool = False
    predefined_inputs: list = [
        "Do you have a job?",
        "Are you happy with the healthcare system?",
        "Do you think income differences are too large?",
        "Which problems are you aware of that affect nature",
        "financial literacy",
        "health literacy",
        "psychometric scales for anxiety",
        "tolerance for income inequality"
    ]
data/full.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02dd973d2d9db91f046bbba81da04ff698a18012b43351d05c79b06a37a94739
3
+ size 93065996
encoder.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from pyserini.search import QueryEncoder
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+
7
+ class SentenceTransformerEncoder(QueryEncoder):
8
+ def __init__(self, model_name: str, device: str = 'cpu'):
9
+ self.device = torch.device(device)
10
+ self.model = SentenceTransformer(model_name, device=self.device)
11
+
12
+ def encode(self, query: str):
13
+ emb = self.model.encode(query)
14
+ emb = emb / np.linalg.norm(emb)
15
+ # emb = np.expand_dims(emb, axis=0)
16
+ return emb
helper.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from itertools import count
2
+ import re
3
+ import numpy as np
4
+ import pandas as pd
5
+ import operator as op
6
+ from uuid import uuid4
7
+ import pycountry
8
+ from pyserini.search import LuceneSearcher, FaissSearcher
9
+ from pyserini.search.hybrid import HybridSearcher
10
+
11
+ from encoder import SentenceTransformerEncoder
12
+
13
+
def load_data(path, langs):
    """Read the TSV corpus at *path* and keep only rows whose ``lang`` is
    in *langs*.

    If the file has no ``uuid`` column yet, stable identifiers are
    generated once and persisted back to disk before filtering.
    """
    frame = pd.read_csv(path, sep='\t')
    if 'uuid' not in frame.columns:
        frame['uuid'] = [uuid4() for _ in range(frame.shape[0])]
        frame.to_csv(path, index=False, sep='\t')
    return frame[frame['lang'].isin(langs)]
22
+
def get_country(code: str) -> str:
    """Resolve an ISO alpha-2 country code to a country name.

    Codes with a '-' suffix (e.g. region qualifiers) are resolved by their
    prefix, with the suffix appended in parentheses.  Unknown codes are
    returned unchanged; falsy input yields an empty string.
    """
    if not code:
        return ""
    match = pycountry.countries.get(alpha_2=code)
    if match:
        return match.name
    if "-" in code:
        head, *tail = code.split("-")
        match = pycountry.countries.get(alpha_2=head)
        if match:
            return match.name + f" ({'-'.join(tail)})"
    return code
39
+
def load_corpus(corpus_path, langs, pattern):
    """Load, clean, and enrich the survey-item corpus.

    Drops empty/duplicate sentences and duplicate ids, indexes rows by
    their string uuid, lower-cases sentences, adds a normalized
    ``alpha_sentence`` column (all non-alphanumerics removed) used for
    grouping identical questions, and parses the stringified country-code
    lists into country names.
    """
    df = load_data(corpus_path, langs)
    df["sentence"] = df["sentence"].apply(lambda s: np.nan if s == "" else s)
    df["uuid"] = df["uuid"].apply(str)
    df = df.dropna(subset=["sentence"])
    df = df.drop_duplicates(subset="id").drop_duplicates()
    df.index = df["uuid"].apply(str)
    df["sentence"] = df["sentence"].apply(str.lower)
    df["alpha_sentence"] = df["sentence"].apply(lambda s: re.sub(pattern, '', s).lower())
    # "['DE', 'FR']" -> ["DE", "FR"] -> ["Germany", "France"]
    df["countries"] = df["countries"].apply(
        lambda raw: raw.replace("'", "").replace(" ", "").replace("[", "").replace("]", "").split(","))
    df["countries"] = df["countries"].apply(lambda codes: [get_country(c) for c in codes])
    return df
54
+
def filter_years(corpus_df, year):
    """Keep rows whose ``date`` lies within the inclusive (start, end) range."""
    start, end = year
    return corpus_df[corpus_df["date"].between(start, end)]
59
+
def filter_corpus(corpus_df, values, column, row_type=list):
    """Keep rows of *corpus_df* whose *column* matches any of *values*.

    For list-valued columns (``row_type=list``) a row matches when its
    list contains any value; for string columns (``row_type=str``) when
    any value is a substring of the cell.  Any other ``row_type`` keeps
    no rows (matching the original behavior).

    Fix: the previous implementation appended a row's positional index
    once per matching value, so a row matching several selected values
    (e.g. two chosen countries) appeared duplicated in the result.  Each
    matching row is now kept exactly once.  (``op.countOf(row, e) > 0``
    was also just ``e in row``.)
    """
    if row_type in (list, str):
        # `v in cell` is list membership for lists, substring test for strings.
        keep = [i for i, cell in enumerate(corpus_df[column].tolist())
                if any(v in cell for v in values)]
    else:
        keep = []
    return corpus_df.loc[corpus_df.index[keep]]
78
+
def load_searchers(index_name, model_name):
    """Build a hybrid (sparse Lucene + dense Faiss) searcher over *index_name*.

    The dense side embeds queries with a SentenceTransformer model named
    *model_name*.
    """
    sparse = LuceneSearcher(index_name)
    dense = FaissSearcher(index_name, SentenceTransformerEncoder(model_name))
    return HybridSearcher(sparse_searcher=sparse, dense_searcher=dense)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openjdk-17-jdk
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit==1.18.1
2
+ pyserini==0.20.0
3
+ faiss-cpu==1.7.3
4
+ sentence-transformers==2.2.2
5
+ pycountry==22.3.5
test-index/_1.fdm ADDED
Binary file (157 Bytes). View file
 
test-index/_1.fdt ADDED
Binary file (87 Bytes). View file
 
test-index/_1.fdx ADDED
Binary file (64 Bytes). View file
 
test-index/_1.fnm ADDED
Binary file (322 Bytes). View file
 
test-index/_1.nvd ADDED
Binary file (59 Bytes). View file
 
test-index/_1.nvm ADDED
Binary file (103 Bytes). View file
 
test-index/_1.si ADDED
Binary file (480 Bytes). View file
 
test-index/_1_Lucene90_0.doc ADDED
Binary file (77 Bytes). View file
 
test-index/_1_Lucene90_0.dvd ADDED
Binary file (74 Bytes). View file
 
test-index/_1_Lucene90_0.dvm ADDED
Binary file (133 Bytes). View file
 
test-index/_1_Lucene90_0.tim ADDED
Binary file (100 Bytes). View file
 
test-index/_1_Lucene90_0.tip ADDED
Binary file (73 Bytes). View file
 
test-index/_1_Lucene90_0.tmd ADDED
Binary file (224 Bytes). View file
 
test-index/docid ADDED
The diff for this file is too large to render. See raw diff
 
test-index/index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19145f7df44fa4b8c980279b8abc4cccb2f08f11ec199c295826e007c2efa20b
3
+ size 28588843
test-index/segments_2 ADDED
Binary file (154 Bytes). View file
 
test-index/write.lock ADDED
File without changes