e-tornike commited on
Commit
d137e33
1 Parent(s): e86164b

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ index filter=lfs diff=lfs merge=lfs -text
37
+ *.tsv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re
import logging

import streamlit as st

from config import Settings
from helper import load_corpus, load_searchers, filter_corpus, filter_years

# Browser-tab title and icon.
st.set_page_config(page_title="Variable Search", page_icon="🔎")

# Hide Streamlit's default hamburger menu and footer.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)


# Sidebar: privacy notice, feature description, and usage instructions.
# (Fixed typo: "Longitudal" -> "Longitudinal".)
sidebar_description = """
**This website saves the text that you write in the search input field. This data is used to improve the search engine.**

__Info__:

This site allows you to search for survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.
Using language models, the input is semantically compared against the question text of survey items.
Ideally, the results should contain the same or similar semantic information as your input.
You can filter the search results by year or country of the survey.

__How to use__:

1. Select a pre-defined input or enter a search query in the search input field.
2. Select one or more countries from the list under "Geography" or leave it empty.
3. Select a study group or leave it empty.
4. Select the range of years to be included in the search results.
5. Click on the search button.
6. Evaluate the results by clicking on "Show X more survey item(s)" to expand the list of results that contain an identical question.

__NOTE__:

__Longitudinal Studies__: Variables that are grouped together often originate from longitudinal studies (i.e., repeated measures over long periods of time).

__Concept Search__: While the search system will perform best for finding texts that are semantically-similar to the input, you may also try more abstract inputs such as concepts (e.g., "financial literacy"). While the system is not specifically designed to retrieve variables that are related to concepts, language models (having seen a large part of the internet) may map concepts to texts similar to the variables that are used to operationalize and measure them.
"""
st.sidebar.markdown(sidebar_description)


st.title("Variable Search")
# Plain string literal: the former f-string carried no placeholders (ruff F541).
st.markdown("Search across survey items (i.e., variables) from surveys such as Eurobarometer, ISSP, EVS, and more.<br>In total, you can search for over 80,000 items.", unsafe_allow_html=True)
@st.cache_data
def prepare_data(corpus_path, langs, pattern):
    """Load the survey-item corpus; cached across Streamlit reruns."""
    corpus = load_corpus(corpus_path, langs, pattern)
    return corpus
@st.cache_resource
def prepare_models(index_name, model_name_or_path):
    """Build the hybrid searcher once; cached as a shared resource."""
    searcher = load_searchers(index_name, model_name_or_path)
    return searcher
def prepare(settings, langs, pattern):
    """Load the corpus dataframe and the hybrid searcher.

    Args:
        settings: app ``Settings`` with corpus path, index name, and model.
        langs: sorted list of language codes to keep in the corpus.
        pattern: compiled regex used to normalize sentences.

    Returns:
        Tuple of (corpus dataframe, hybrid searcher).
    """
    # Fix: `langs` used to be recomputed from settings here, silently
    # shadowing and ignoring the argument; the caller already passes the
    # identical value, so dropping the reassignment preserves behavior.
    df = prepare_data(settings.corpus_path, langs, pattern)
    hsearcher = prepare_models(settings.index_name, settings.model_name_or_path)
    return df, hsearcher
# Main page: search form, filters, and result rendering.
try:
    st.info("Please note, the **text** that you write into the text box **is saved** to improve the search engine.")

    settings = Settings()

    # Pre-defined inputs plus a sentinel entry that switches to free-text input.
    query = st.selectbox("Pre-defined inputs:", ["Another query (via 'Search input')..."]+settings.predefined_inputs, key="pre-query")

    col1, col2 = st.columns([9,1])

    if query == "Another query (via 'Search input')...":
        with col1:
            query = st.text_input(label="Search input:", placeholder="Do you have a job?", key="query")

        with col2:
            st.write('#')  # vertical spacer so the button lines up with the input
            button_clicked = st.button("🔎")
    else:
        button_clicked = False

    langs = sorted(settings.languages.split(','))
    pattern = re.compile(r'[\W_]+')  # strips every non-alphanumeric character

    corpus_df, hsearcher = prepare(settings, langs, pattern)

    # Country filter; placeholder codes containing "_" are excluded.
    all_countries = sorted(list(set([c for cs in corpus_df["countries"].tolist() for c in cs if c and "_" not in c])))
    countries = st.multiselect("Geography:", all_countries, key="countries")
    if countries:
        corpus_df = filter_corpus(corpus_df, countries, column="countries")

    all_study_groups = sorted(list(set(corpus_df["title"].tolist())))
    study_groups = st.multiselect("Study Group:", all_study_groups, key="study_groups")
    if study_groups:
        corpus_df = filter_corpus(corpus_df, study_groups, column="title", row_type=str)

    # Year range slider; collapses to a static label when only one year remains.
    unique_years = list(set([int(x) for x in corpus_df["date"].tolist() if isinstance(x, str) or isinstance(x, int)]))
    if unique_years:
        min_year, max_year = min(unique_years), max(unique_years)
        if min_year < max_year:
            year = st.slider("Publication Year:", min_year, max_year, (min_year, max_year), 1, key="year")
            corpus_df = filter_years(corpus_df, year)
        else:
            year = min_year
            st.markdown(f"Year: {min_year}")
            st.markdown("---")
    else:
        year = None

    # Identical questions are grouped by their normalized (alphanumeric-only) text.
    corpus_groups = corpus_df.groupby(by='alpha_sentence')

    try:
        if (query or button_clicked) and query != "":
            logging.info(f"Query: '{query}'")
            logging.info(f"Geography: {countries}")
            logging.info(f"Min/max Years: {year}")

            with st.spinner("Searching..."):
                hits = hsearcher.search(query, alpha=settings.alpha, k0=settings.top_k, k=settings.top_k, normalization=settings.normalization, weight_on_dense=settings.weight_on_dense)
                # Deduplicate hits on their normalized sentence, preserving rank order.
                result_sentences = []
                for hit in hits:
                    _id = hit.docid
                    if _id in corpus_df.index:
                        result_sentence = corpus_df.loc[_id]["sentence"]
                        result_sentence = re.sub(pattern, '', result_sentence).lower()
                        if result_sentence not in result_sentences:
                            result_sentences.append(result_sentence)

                st.write(f"<i>Showing the top {len(result_sentences)} result(s) out of {len(corpus_groups.groups)} question(s).</i>", unsafe_allow_html=True)
                st.write("---")

                for j,sentence in enumerate(result_sentences):
                    if sentence in corpus_groups.groups:
                        group = corpus_groups.get_group(sentence)
                        osentence = group.iloc[0].get('sentence', '')

                        st.markdown(f'Question: {osentence}', unsafe_allow_html=True)
                        expander_text = f'Show {group.shape[0]} grouped survey items.' if group.shape[0] > 1 else f'Show {group.shape[0]} grouped survey item.'
                        modal = st.expander(expander_text)
                        for i in range(group.shape[0]):
                            row = group.iloc[i]
                            rid = row.get('id', '')
                            rlabel = row.get('label', '')
                            rsq = row.get('sub-question', '')
                            ritem = row.get('item_category', '')
                            rtitle = row.get('title', '')
                            if rtitle and rid:
                                rtitle = f'<a href="https://search.gesis.org/research_data/{rid.split("_")[0]}">{rtitle}</a>'
                            rdate = row.get('date', '')  # TODO: what is this date?
                            rcountries = row.get('countries', '')
                            rqt1 = row.get('question_type1', '')
                            rqt2 = row.get('question_type2', '')

                            modal.markdown(f'<a href="https://search.gesis.org/variables/exploredata-{rid}">{rid}</a>\
                                <br>Label: {rlabel}\
                                <br>Sub-Question: {rsq}\
                                <br>Item: {ritem}\
                                <br>Research Data: {rtitle}\
                                <br>Study Date: {rdate}\
                                <br>Geography: {rcountries}\
                                <br>Question Type 1: {rqt1}\
                                <br>Question Type 2: {rqt2}',
                                unsafe_allow_html=True
                            )
                            # Divider between grouped items (chained comparison:
                            # i+1 < size AND size > 1).
                            if i+1 < group.shape[0] > 1:
                                modal.markdown('---')

                        if j+1 < len(result_sentences) > 1:
                            st.markdown('---')
                    else:
                        logging.debug(f"Sentence is not in groups: {sentence}")
    except Exception:
        # Fix: was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
        # logging.exception records the traceback for debugging.
        st.error("Something went wrong. Please try again with a different input.")
        logging.exception(f'An error occurred for the query: {query}')
except Exception:
    # Fix: was a bare `except:`; keep the user-facing message, log the traceback.
    st.error("Something went wrong. Please try again later.")
    logging.exception('The app crashed.')
config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
class Settings:
    """Static configuration for the variable-search app.

    Holds paths, index/model identifiers, search hyper-parameters, and the
    pre-defined example inputs shown in the UI.  All values are class-level
    constants; annotations added consistently (previously only
    ``model_name_or_path`` was annotated).
    """

    root_dir: str = './data/embeddings/'
    corpus_path: str = './data/full.tsv'
    model_name_or_path: str = 'intfloat/e5-small'
    index_name: str = 'test-index'
    languages: str = 'en'          # comma-separated language codes
    top_k: int = 100               # number of hits requested from each searcher
    alpha: float = 2.0             # sparse/dense interpolation weight
    normalization: bool = True
    weight_on_dense: bool = False
    predefined_inputs: list = [
        "Do you have a job?",
        "Are you happy with the healthcare system?",
        "Do you think income differences are too large?",
        "Which problems are you aware of that affect nature",
        "financial literacy",
        "health literacy",
        "psychometric scales for anxiety",
        "tolerance for income inequality"
    ]
data/full.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02dd973d2d9db91f046bbba81da04ff698a18012b43351d05c79b06a37a94739
3
+ size 93065996
encoder.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from pyserini.search import QueryEncoder
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+
7
+ class SentenceTransformerEncoder(QueryEncoder):
8
+ def __init__(self, model_name: str, device: str = 'cpu'):
9
+ self.device = torch.device(device)
10
+ self.model = SentenceTransformer(model_name, device=self.device)
11
+
12
+ def encode(self, query: str):
13
+ emb = self.model.encode(query)
14
+ emb = emb / np.linalg.norm(emb)
15
+ # emb = np.expand_dims(emb, axis=0)
16
+ return emb
helper.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from itertools import count
2
+ import re
3
+ import numpy as np
4
+ import pandas as pd
5
+ import operator as op
6
+ from uuid import uuid4
7
+ import pycountry
8
+ from pyserini.search import LuceneSearcher, FaissSearcher
9
+ from pyserini.search.hybrid import HybridSearcher
10
+
11
+ from encoder import SentenceTransformerEncoder
12
+
13
+
def load_data(path, langs):
    """Read the TSV corpus at *path* and keep only rows whose ``lang`` is
    in *langs*.

    If the file has no ``uuid`` column yet, stable identifiers are
    generated once and persisted back to disk before filtering.
    """
    frame = pd.read_csv(path, sep='\t')
    if 'uuid' not in frame.columns:
        frame['uuid'] = [uuid4() for _ in range(frame.shape[0])]
        frame.to_csv(path, index=False, sep='\t')
    return frame[frame['lang'].isin(langs)]
22
+
def get_country(code: str) -> str:
    """Resolve an ISO alpha-2 country code to a country name.

    Codes with a '-' suffix (e.g. region qualifiers) are resolved by their
    prefix, with the suffix appended in parentheses.  Unknown codes are
    returned unchanged; falsy input yields an empty string.
    """
    if not code:
        return ""
    match = pycountry.countries.get(alpha_2=code)
    if match:
        return match.name
    if "-" in code:
        head, *tail = code.split("-")
        match = pycountry.countries.get(alpha_2=head)
        if match:
            return match.name + f" ({'-'.join(tail)})"
    return code
39
+
def load_corpus(corpus_path, langs, pattern):
    """Load, clean, and enrich the survey-item corpus.

    Drops empty/duplicate sentences and duplicate ids, indexes rows by
    their string uuid, lower-cases sentences, adds a normalized
    ``alpha_sentence`` column (all non-alphanumerics removed) used for
    grouping identical questions, and parses the stringified country-code
    lists into country names.
    """
    df = load_data(corpus_path, langs)
    df["sentence"] = df["sentence"].apply(lambda s: np.nan if s == "" else s)
    df["uuid"] = df["uuid"].apply(str)
    df = df.dropna(subset=["sentence"])
    df = df.drop_duplicates(subset="id").drop_duplicates()
    df.index = df["uuid"].apply(str)
    df["sentence"] = df["sentence"].apply(str.lower)
    df["alpha_sentence"] = df["sentence"].apply(lambda s: re.sub(pattern, '', s).lower())
    # "['DE', 'FR']" -> ["DE", "FR"] -> ["Germany", "France"]
    df["countries"] = df["countries"].apply(
        lambda raw: raw.replace("'", "").replace(" ", "").replace("[", "").replace("]", "").split(","))
    df["countries"] = df["countries"].apply(lambda codes: [get_country(c) for c in codes])
    return df
54
+
def filter_years(corpus_df, year):
    """Keep rows whose ``date`` lies within the inclusive (start, end) range."""
    start, end = year
    return corpus_df[corpus_df["date"].between(start, end)]
59
+
def filter_corpus(corpus_df, values, column, row_type=list):
    """Keep rows of *corpus_df* whose *column* matches any of *values*.

    For list-valued columns (``row_type=list``) a row matches when its
    list contains any value; for string columns (``row_type=str``) when
    any value is a substring of the cell.  Any other ``row_type`` keeps
    no rows (matching the original behavior).

    Fix: the previous implementation appended a row's positional index
    once per matching value, so a row matching several selected values
    (e.g. two chosen countries) appeared duplicated in the result.  Each
    matching row is now kept exactly once.  (``op.countOf(row, e) > 0``
    was also just ``e in row``.)
    """
    if row_type in (list, str):
        # `v in cell` is list membership for lists, substring test for strings.
        keep = [i for i, cell in enumerate(corpus_df[column].tolist())
                if any(v in cell for v in values)]
    else:
        keep = []
    return corpus_df.loc[corpus_df.index[keep]]
78
+
def load_searchers(index_name, model_name):
    """Build a hybrid (sparse Lucene + dense Faiss) searcher over *index_name*.

    The dense side embeds queries with a SentenceTransformer model named
    *model_name*.
    """
    sparse = LuceneSearcher(index_name)
    dense = FaissSearcher(index_name, SentenceTransformerEncoder(model_name))
    return HybridSearcher(sparse_searcher=sparse, dense_searcher=dense)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openjdk-17-jdk
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit==1.18.1
2
+ pyserini==0.20.0
3
+ faiss-cpu==1.7.3
4
+ sentence-transformers==2.2.2
5
+ pycountry==22.3.5
test-index/_1.fdm ADDED
Binary file (157 Bytes). View file
 
test-index/_1.fdt ADDED
Binary file (87 Bytes). View file
 
test-index/_1.fdx ADDED
Binary file (64 Bytes). View file
 
test-index/_1.fnm ADDED
Binary file (322 Bytes). View file
 
test-index/_1.nvd ADDED
Binary file (59 Bytes). View file
 
test-index/_1.nvm ADDED
Binary file (103 Bytes). View file
 
test-index/_1.si ADDED
Binary file (480 Bytes). View file
 
test-index/_1_Lucene90_0.doc ADDED
Binary file (77 Bytes). View file
 
test-index/_1_Lucene90_0.dvd ADDED
Binary file (74 Bytes). View file
 
test-index/_1_Lucene90_0.dvm ADDED
Binary file (133 Bytes). View file
 
test-index/_1_Lucene90_0.tim ADDED
Binary file (100 Bytes). View file
 
test-index/_1_Lucene90_0.tip ADDED
Binary file (73 Bytes). View file
 
test-index/_1_Lucene90_0.tmd ADDED
Binary file (224 Bytes). View file
 
test-index/docid ADDED
The diff for this file is too large to render. See raw diff
 
test-index/index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19145f7df44fa4b8c980279b8abc4cccb2f08f11ec199c295826e007c2efa20b
3
+ size 28588843
test-index/segments_2 ADDED
Binary file (154 Bytes). View file
 
test-index/write.lock ADDED
File without changes