import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pytz
from huggingface_hub import HfApi, Repository

GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
local_repo_path = "./readme_repo"

def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Uses the module-level `hf_token` for authentication and `local_repo_path`
    as the location of the local clone.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Initialize HfApi
    api = HfApi()

    # Remove any stale local clone left over from a previous run
    if Path(local_repo_path).exists():
        rmtree(local_repo_path)

    # Clone the repository locally
    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)

    # Read the README file
    with open(f"{local_repo_path}/README.md", "r") as file:
        old_readme = file.read()

    # Modify the README
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Write the updated README back to the repository
    with open(f"{local_repo_path}/README.md", "w") as file:
        file.write(new_readme)

    # Push the changes
    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')

def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Timestamp of this run, truncated to the hour (UTC)
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and adds [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.

The goal is to provide an automatic, free semantic/neural search tool for any subreddit.

The last run was on {latest_hour_str} and updated {new_rows} new rows.

## Creation Details
This is done by triggering the [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
processing space via a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks), which calculates the embeddings and updates the [nomic atlas](https://docs.nomic.ai)
visualization.

## Update Frequency
The dataset is updated on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated.

## Opt-out
To opt out of this dataset, please make a request in the community tab.
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Marker already present: keep everything up to and including the marker,
        # and replace everything after it with the freshly generated text
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated text
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
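

# A minimal usage sketch, not part of the original file: how this module might be
# invoked directly. The dataset name and row count below are hypothetical
# placeholders for illustration; in practice the caller supplies real values
# (e.g. after an embedding run determines how many rows were added).
if __name__ == "__main__":
    update_dataset_readme(
        dataset_name="your-username/your-embeddings-dataset",  # hypothetical target repo
        subreddit="bestofredditorupdates",
        new_rows=42,  # placeholder row count
    )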