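"""Collections Reading List Generator.

A Gradio Space that takes a Hugging Face collection slug, extracts the
arXiv papers referenced by the collection's papers, models, and datasets,
and asks the Semantic Scholar recommendations API for related reading.
"""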
import json
from typing import Any, Dict, List, Optional, Union
import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby
CACHE_TIME = 60 * 60 * 1 # 1 hour
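# Companion Space that extracts the arXiv IDs referenced by a collection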
client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")
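# Short 10-second TTL: collection contents can change at any time, so slug
# lookups are refreshed frequently (recommendations below cache for longer)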
@cached(cache=TTLCache(maxsize=500, ttl=10))
def get_arxiv_ids_from_slug(
slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, List[str]]]]]:
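    """Fetch the arXiv IDs referenced by a collection, via the extractor Space.

    The returned payload (written to a JSON file by the Space) maps repo
    types ("papers", "model papers", "dataset papers") to repos and their
    arXiv IDs.
    """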
result = client.predict(slug, api_name="/predict")
with open(result) as f:
data = json.load(f)
return data
def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
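    """Prefix an arXiv ID the way the Semantic Scholar API expects ("ArXiv:<id>")."""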
return f"ArXiv:{arxiv_id}"
def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
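    """Flatten the extractor payload into Semantic Scholar paper IDs.

    Repo-type sections listed in `exclude_keys` (e.g. "model papers") are
    dropped before the arXiv IDs are collected.
    """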
arxiv_ids = []
if exclude_keys is not None:
data = {k: v for k, v in data.items() if k not in exclude_keys}
# check if dict now empty
if not data:
return []
for repo in data.values():
if repo is None:
continue
for item in repo.values():
arxiv_ids.extend(item["arxiv_ids"])
# format for semantic scholar
return [format_arxiv_id_for_semantic_scholar(id) for id in arxiv_ids]
@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
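    """Ask the Semantic Scholar API for up to 10 papers related to `paper_ids`.

    Accepts a tuple rather than a list so the argument is hashable for
    the TTL cache.
    """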
paper_ids = list(paper_ids)
r = httpx.post(
"https://api.semanticscholar.org/recommendations/v1/papers/",
json={
"positivePaperIds": paper_ids,
},
params={"fields": "externalIds,title,year", "limit": 10},
timeout=30,
)
return r.json()
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    # externalIds may be missing or None in Semantic Scholar responses
    return (recommendation.get("externalIds") or {}).get("ArXiv") is not None
def group_by_is_arxiv_paper(
recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
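    """Split recommendations into arXiv papers (True key) and the rest (False key)."""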
return groupby(is_arxiv_paper, recommendations)
def format_recommendation_into_markdown(
grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
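    """Render grouped recommendations as Markdown, linking arXiv papers to
    their Hugging Face Papers pages."""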
comment = "The following papers were recommended by the Semantic Scholar API \n\n"
arxiv_papers = grouped_recommendations.get(True)
if arxiv_papers:
comment += "## Papers available on Hugging Face Papers:\n\n"
for r in arxiv_papers:
hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
other_papers = grouped_recommendations.get(False)
if other_papers:
comment += "\n\n## Other papers:\n\n"
for r in other_papers:
comment += f"* {r['title']} ({r['year']})\n"
return comment
def map_repo_name_to_api_key(repo_name: str) -> str:
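    """Translate a repo type from the UI into the extractor payload's key."""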
return {
"datasets": "dataset papers",
"models": "model papers",
"papers": "papers",
}[repo_name]
def get_recommendations_from_slug(
slug: str, excluded_repo_types: Optional[list[str]] = None
):
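    """Entry point for the UI: makes the exclusion list hashable (a tuple)
    so the cached implementation below can memoise the call."""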
    # guard against None: tuple(None) would raise a TypeError
    excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)
@cached(cache=TTLCache(maxsize=500, ttl=60))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
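    """Build the Markdown reading list for a collection slug."""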
data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        exclusions = ",".join(excluded_repo_types) if excluded_repo_types else "none"
        return (
            f"Based on your collection and exclusions ({exclusions}), there are"
            " no papers to recommend. Try removing some excluded repo types or"
            " adding more items to your collection."
        )
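    # tuple rather than list so the IDs are hashable for the recommendations cache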
ids = tuple(ids)
recommendations = get_recommendations_from_semantic_scholar(ids)
recommendations = recommendations.get("recommendedPapers")
if recommendations is None:
raise gr.Error("Something went wrong with the Semantic Scholar API")
grouped = group_by_is_arxiv_paper(recommendations)
return format_recommendation_into_markdown(grouped)
title = """πŸ“š Collections Reading List Generator πŸ“š"""
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, spaces,
and papers from the Hugging Face Hub.
This Space will generate a reading list based on the items in your collection.
This can be a great way to find related papers to the models and datasets in your collection and dive more deeply into a topic!
The Space works by:
- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.
You can optionally exclude certain repo types from consideration when generating the reading list.
"""
slug_input = gr.Textbox(
lines=1,
label="Collection Slug",
placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)
example_slugs = [
["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
["osanseviero/model-merging-65097893623330a3a51ead66", []],
["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]
gr.Interface(
get_recommendations_from_slug,
inputs=[
slug_input,
gr.Dropdown(
label="Repos to exclude from contributing to recommendations",
choices=["datasets", "models", "papers"],
multiselect=True,
),
],
outputs="markdown",
description=description,
title=title,
allow_flagging="never",
examples=example_slugs,
).launch()
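# The reading-list function can also be called directly (before launch() or in
# a REPL), e.g.:
#   get_recommendations_from_slug(
#       "merve/video-classification-models-6509edd0a6f657faa425e8c3", ["models"]
#   )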