libt

Runtime error

App Files Files Community

ehristoforu committed on Jul 18, 2023

Commit

1f8505f

•

0 Parent(s):

Duplicate from TNR-5/lib

Browse files

Files changed (9) hide show

.gitattributes +29 -0
.gitignore +145 -0
README.md +39 -0
app.py +118 -0
embeddings/multi-qa-MiniLM-L6-cos-v1-embeddings.pt +3 -0
hf_data/models.jsonl +3 -0
hf_data/passages.jsonl +3 -0
requirements.txt +6 -0
st_utils.py +126 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,29 @@

+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,145 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/

README.md ADDED Viewed

	@@ -0,0 +1,39 @@

+---
+title: HuggingFace Search Engine
+emoji: 🔎🤗
+colorFrom: blue
+colorTo: gray
+sdk: streamlit
+app_file: app.py
+pinned: true
+duplicated_from: TNR-5/lib
+---
+# Configuration
+`title`: _string_
+Display title for the Space.
+`emoji`: _string_
+Space emoji (only emoji characters are allowed)
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+`sdk_version` : _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+`models`: _List[string]_
+HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`datasets`: _List[string]_
+HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
+Will be parsed automatically from your code if not specified here.
+`pinned`: _boolean_
+Whether the Space stays on top of your list.

app.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import streamlit as st
+from st_utils import bm25_search, semantic_search, hf_api, paginator
+from huggingface_hub import ModelSearchArguments
+import webbrowser
+from numerize.numerize import numerize
+import math
+st.set_page_config(
+    page_title="HF Search Engine",
+    page_icon="🔎",
+    layout="wide",
+    initial_sidebar_state="auto",
+)
+### SIDEBAR
+search_backend = st.sidebar.selectbox(
+    "Search method",
+    ["semantic", "bm25", "hfapi"],
+    format_func=lambda x: {"hfapi": "Keyword search", "bm25": "BM25 search", "semantic": "Semantic Search"}[x],
+)
+limit_results = int(st.sidebar.number_input("Limit results", min_value=0, value=10))
+sort_by = st.sidebar.selectbox(
+    "Sort by",
+    [None, "downloads", "likes", "lastModified"],
+    format_func=lambda x: {None: "Relevance", "downloads": "Most downloads", "likes": "Most likes", "lastModified": "Recently updated"}[x],
+)
+st.sidebar.markdown("# Filters")
+args = ModelSearchArguments()
+library = st.sidebar.multiselect(
+    "Library", args.library.values(), format_func=lambda x: {v: k for k, v in args.library.items()}[x]
+)
+task = st.sidebar.multiselect(
+    "Task", args.pipeline_tag.values(), format_func=lambda x: {v: k for k, v in args.pipeline_tag.items()}[x]
+)
+### MAIN PAGE
+st.markdown(
+    "<h1 style='text-align: center; '>🔎🤗 HF Search Engine</h1>",
+    unsafe_allow_html=True,
+)
+# Search bar
+search_query = st.text_input("Search for a model in HuggingFace", value="", max_chars=None, key=None, type="default")
+if search_query != "":
+    filters = {
+        "library": library,
+        "task": task,
+    }
+    if search_backend == "hfapi":
+        res = hf_api(search_query, limit_results, sort_by, filters)
+    elif search_backend == "semantic":
+        res = semantic_search(search_query, limit_results, sort_by, filters)
+    elif search_backend == "bm25":
+        res = bm25_search(search_query, limit_results, sort_by, filters)
+    hit_list, hits_count = res["hits"], res["count"]
+    hit_list = [
+        {
+            "modelId": hit["modelId"],
+            "tags": hit["tags"],
+            "downloads": hit["downloads"],
+            "likes": hit["likes"],
+            "readme": hit.get("readme", None),
+        }
+        for hit in hit_list
+    ]
+    if hit_list:
+        st.write(f"Search results ({hits_count}):")
+        if hits_count > 100:
+            shown_results = 100
+        else:
+            shown_results = hits_count
+        for i, hit in paginator(
+            f"Select results (showing {shown_results} of {hits_count} results)",
+            hit_list,
+        ):
+            col1, col2, col3 = st.columns([5, 1, 1])
+            col1.metric("Model", hit["modelId"])
+            col2.metric("N° downloads", numerize(hit["downloads"]) if hit["downloads"] and not math.isnan(hit["downloads"]) else "N/A")
+            col3.metric("N° likes", numerize(hit["likes"]) if hit["likes"] and not math.isnan(hit["likes"]) else "N/A")
+            st.button(
+                f"View model on 🤗",
+                on_click=lambda hit=hit: webbrowser.open(f"https://huggingface.co/{hit['modelId']}", new=2),
+                key=f"{i}-{hit['modelId']}",
+            )
+            st.write(f"**Tags:** {'&nbsp;&nbsp;•&nbsp;&nbsp;'.join(hit['tags'])}")
+            if hit["readme"]:
+                with st.expander("See README"):
+                    st.write(hit["readme"])
+            # TODO: embed huggingface spaces
+            #                 import streamlit.components.v1 as components
+            #                 components.html(
+            #     f"""
+            #     <link rel="stylesheet" href="https://gradio.s3-us-west-2.amazonaws.com/2.6.2/static/bundle.css">
+            # <div id="target"></div>
+            # <script src="https://gradio.s3-us-west-2.amazonaws.com/2.6.2/static/bundle.js"></script>
+            # <script>
+            # launchGradioFromSpaces("abidlabs/question-answering", "#target")
+            # </script>
+            #     """,
+            #     height=400,
+            # )
+            st.markdown("---")
+    else:
+        st.write(f"No Search results, please try again with different keywords")
+st.markdown(
+    "<h6 style='text-align: center; color: #808080;'>Made with ❤️ By <a href='https://github.com/NouamaneTazi'>Nouamane</a> - Checkout complete project <a href='https://github.com/NouamaneTazi/hf_search'>here</a></h6>",
+    unsafe_allow_html=True,
+)

embeddings/multi-qa-MiniLM-L6-cos-v1-embeddings.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c95fa4fe96c9e032d94d9c5af0d83667e088fe65c1bcfe5e40c18cc0096ae73c
+size 486097131

hf_data/models.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23d9543aa28ea8b2205c690029c9b96f5bc76bd3e2b7c38cbc21c2113e952a15
+size 72808045

hf_data/passages.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbf33260e5185e04b42923e15039cd0cc081f407d6a4ac3ad35b08007b9a54e3
+size 71939324

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pandas
+streamlit
+huggingface_hub
+numerize
+pbr
+git+https://github.com/NouamaneTazi/hf_search  # the original "@<ref>" pin was lost to e-mail obfuscation when this page was captured

st_utils.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import json
+from huggingface_hub import HfApi, ModelFilter, DatasetFilter, ModelSearchArguments
+from pprint import pprint
+from hf_search import HFSearch
+import streamlit as st
+import itertools
+from pbr.version import VersionInfo
+# Log which hf_search release is installed; this runs at import time.
+print("hf_search version:", VersionInfo('hf_search').version_string())
+# Module-level search engine instance used by semantic_search and bm25_search.
+# NOTE(review): top_k=200 presumably caps the number of retrieved candidates —
+# confirm against the hf_search package.
+hf_search = HFSearch(top_k=200)
+@st.cache
+def hf_api(query, limit=5, sort=None, filters={}):
+    print("query", query)
+    print("filters", filters)
+    print("limit", limit)
+    print("sort", sort)
+    api = HfApi()
+    filt = ModelFilter(
+        task=filters["task"],
+        library=filters["library"],
+    )
+    models = api.list_models(search=query, filter=filt, limit=limit, sort=sort, full=True)
+    hits = []
+    for model in models:
+        model = model.__dict__
+        hits.append(
+            {
+                "modelId": model.get("modelId"),
+                "tags": model.get("tags"),
+                "downloads": model.get("downloads"),
+                "likes": model.get("likes"),
+            }
+        )
+    count = len(hits)
+    if len(hits) > limit:
+        hits = hits[:limit]
+    return {"hits": hits, "count": count}
+@st.cache
+def semantic_search(query, limit=5, sort=None, filters={}):
+    print("query", query)
+    print("filters", filters)
+    print("limit", limit)
+    print("sort", sort)
+    hits = hf_search.search(query=query, method="retrieve & rerank", limit=limit, sort=sort, filters=filters)
+    hits = [
+        {
+            "modelId": hit["modelId"],
+            "tags": hit["tags"],
+            "downloads": hit["downloads"],
+            "likes": hit["likes"],
+            "readme": hit.get("readme", None),
+        }
+        for hit in hits
+    ]
+    return {"hits": hits, "count": len(hits)}
+@st.cache
+def bm25_search(query, limit=5, sort=None, filters={}):
+    print("query", query)
+    print("filters", filters)
+    print("limit", limit)
+    print("sort", sort)
+    # TODO: filters
+    hits = hf_search.search(query=query, method="bm25", limit=limit, sort=sort, filters=filters)
+    hits = [
+        {
+            "modelId": hit["modelId"],
+            "tags": hit["tags"],
+            "downloads": hit["downloads"],
+            "likes": hit["likes"],
+            "readme": hit.get("readme", None),
+        }
+        for hit in hits
+    ]
+    hits = [
+        hits[i] for i in range(len(hits)) if hits[i]["modelId"] not in [h["modelId"] for h in hits[:i]]
+    ]  # unique hits
+    return {"hits": hits, "count": len(hits)}
+def paginator(label, articles, articles_per_page=10, on_sidebar=True):
+    # https://gist.github.com/treuille/2ce0acb6697f205e44e3e0f576e810b7
+    """Lets the user paginate a set of article.
+    Parameters
+    ----------
+    label : str
+        The label to display over the pagination widget.
+    article : Iterator[Any]
+        The articles to display in the paginator.
+    articles_per_page: int
+        The number of articles to display per page.
+    on_sidebar: bool
+        Whether to display the paginator widget on the sidebar.
+    Returns
+    -------
+    Iterator[Tuple[int, Any]]
+        An iterator over *only the article on that page*, including
+        the item's index.
+    """
+    # Figure out where to display the paginator
+    if on_sidebar:
+        location = st.sidebar.empty()
+    else:
+        location = st.empty()
+    # Display a pagination selectbox in the specified location.
+    articles = list(articles)
+    n_pages = (len(articles) - 1) // articles_per_page + 1
+    page_format_func = lambda i: f"Results {i*10} to {i*10 +10 -1}"
+    page_number = location.selectbox(label, range(n_pages), format_func=page_format_func)
+    # Iterate over the articles in the page to let the user display them.
+    min_index = page_number * articles_per_page
+    max_index = min_index + articles_per_page
+    return itertools.islice(enumerate(articles), min_index, max_index)