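"""Update the README of a Hugging Face dataset repository after each processing run.

A generated status section is appended below ``GENERATED_BELOW_MARKER`` so that any
hand-written content above the marker is preserved on subsequent runs.
"""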
import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import HfApi, Repository

GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Uses the module-level `hf_token` (Hugging Face authentication token) and
    `local_repo_path` (local path to clone the repository into).

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Initialize HfApi
    api = HfApi()

    # Remove any stale local clone before cloning a fresh copy
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    # Clone the repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the README file
    with open(f"{local_repo_path}/README.md", "r") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(f"{local_repo_path}/README.md", "w") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
readme_text = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.
The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.
The last run was on {latest_hour_str} and updated {new_rows} new rows.
## Creation Details
This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
based on a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks) to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
visualization. This is done by this [processing space](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates).
## Update Frequency
The dataset is updated based on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated.
## Opt-out
To opt-out of this dataset please make a request in the community tab
"""

    if GENERATED_BELOW_MARKER in old_readme:
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
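

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline: the dataset name,
    # subreddit, and row count below are hypothetical placeholders. Running this
    # clones the dataset repo, rewrites its README, and pushes a commit to the Hub.
    update_dataset_readme(
        dataset_name="your-username/your-embeddings-dataset",
        subreddit="bestofredditorupdates",
        new_rows=100,
    )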