"""Maintain the auto-generated section of a dataset README on the Hugging Face Hub.

Configuration is read from environment variables at import time:

* ``FREQUENCY`` (optional): update cadence, lower-cased and interpolated
  verbatim into the README text. Defaults to an empty string.
* ``USERNAME`` (required): interpolated into the link to the creating Space.
* ``HF_TOKEN`` (required): Hugging Face token used to authenticate Hub calls.

A missing required variable raises ``KeyError`` when the module is imported.
"""
import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import HfApi, Repository

frequency = os.environ.get("FREQUENCY", "").lower()

# Everything after this marker in the README is regenerated on each update;
# everything before it is preserved untouched.
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"

username = os.environ["USERNAME"]
hf_token = os.environ["HF_TOKEN"]

# NOTE(review): not referenced anywhere in this module; kept because other
# modules may import it -- confirm before removing.
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Downloads the current ``README.md`` of the dataset repo, regenerates the
    generated section via :func:`append_to_readme`, and uploads the result as
    a new commit.

    Args:
        dataset_name (str): Name (repo id) of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
    """
    # Pass the token explicitly rather than relying on the library picking
    # HF_TOKEN up from the environment on its own.
    api = HfApi(token=hf_token)

    # Download README file
    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")

    # Read it. The upload below encodes as UTF-8, so decode as UTF-8 too
    # instead of depending on the platform's default encoding.
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    # Modify it
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Commit modifications
    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        commit_message=f'Pushing {new_rows} new rows'
    )


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    If ``GENERATED_BELOW_MARKER`` is already present in ``old_readme``,
    everything after its first occurrence is replaced by freshly generated
    text. Otherwise the marker and the generated text are appended to the
    end of the existing content.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # The README reports the update time truncated to the start of the
    # current UTC hour.
    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    # NOTE(review): the opt-out link below is hard-coded to one specific Space
    # and does not use {username}/{subreddit} like the "Creation Details"
    # link does -- confirm whether that is intentional.
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. 
I'm leveraging PRAW and the Reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.

## Creation Details
This dataset was created by [{username}/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/{username}/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.

## Licensing
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt-out of this dataset please make a pull request with your justification and add your ids in filter_ids.json

1. Go to [filter_ids.json](https://huggingface.co/spaces/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates/blob/main/filter_ids.json)
2. Click Edit
3. Add your ids, 1 per row
4. Comment with your justification
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Keep everything up to and including the marker; drop the previously
        # generated text that followed it.
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run for this README: add the marker, then the generated text.
        # NOTE(review): this branch adds a trailing newline that the branch
        # above does not; preserved as-is.
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme