File size: 3,484 Bytes
ed3130d
5d9e0b8
41daa3d
 
ed3130d
5d9e0b8
5ec6657
ed3130d
5d9e0b8
5ec6657
3e1e25d
47ad458
5ec6657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9188762
 
41daa3d
9188762
 
5ec6657
 
9188762
5ec6657
 
9188762
 
 
 
 
 
 
 
41daa3d
5ec6657
 
 
 
 
 
 
 
 
 
 
 
 
5d9e0b8
 
 
ed3130d
285612d
5ec6657
285612d
5d9e0b8
285612d
613d6f5
3e1e25d
613d6f5
7641c8b
5d9e0b8
bc7f4d5
 
 
5ec6657
5d9e0b8
 
a9b0348
 
 
 
 
 
285612d
ed3130d
5ec6657
 
 
ed3130d
5ec6657
ed3130d
41daa3d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from datetime import datetime
from pathlib import Path
from shutil import rmtree

import pytz
from huggingface_hub import HfApi, Repository

frequency = os.environ.get("FREQUENCY", '').lower()
GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
username = os.environ["USERNAME"]
hf_token = os.environ["HF_TOKEN"]
local_repo_path = "./readme_repo"


def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Downloads the current README.md from the dataset repo, appends/refreshes the
    generated section via append_to_readme, and commits the result back.

    Args:
    dataset_name (str): Name of the dataset repository (repo_id on the Hub).
    subreddit (str): Name of the subreddit being used for dataset creation.
    new_rows (int): Number of new rows added in the latest update.
    """
    # Authenticate explicitly with the module-level token read from HF_TOKEN;
    # relying on implicit discovery can silently fail in CI environments.
    api = HfApi(token=hf_token)

    # Download the current README file from the dataset repo
    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md")

    # Read it as UTF-8 — README content is markdown and may contain non-ASCII
    with open(readme_path, "r", encoding="utf-8") as file:
        old_readme = file.read()

    # Modify it
    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Commit modifications back to the Hub
    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        commit_message=f'Pushing {new_rows} new rows'
    )


def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Everything after GENERATED_BELOW_MARKER is regenerated on each call; if the
    marker is absent it is appended first, then followed by the generated text.

    Args:
    subreddit (str): Name of the subreddit.
    new_rows (int): Number of new rows added.
    old_readme (str): Existing README content.

    Returns:
    str: Updated README content.
    """
    # Truncate the timestamp to the hour; stdlib timezone.utc formats
    # '%Z%z' as 'UTC+0000', same as the previously-used pytz.utc.
    latest_hour = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of [r/{subreddit}](https://www.reddit.com/r/{subreddit}/) submissions. I'm leveraging PRAW and the Reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run {frequency} to get new submissions.

## Creation Details
This dataset was created by [{username}/dataset-creator-reddit-{subreddit}](https://huggingface.co/spaces/{username}/dataset-creator-reddit-{subreddit})

## Update Frequency
The dataset is updated {frequency} with the most recent update being `{latest_hour_str}` where we added **{new_rows} new rows**.

## Licensing 
[Reddit Licensing terms](https://www.redditinc.com/policies/data-api-terms) as accessed on October 25:
[License information]

## Opt-out
To opt-out of this dataset please make a pull request with your justification and add your ids in filter_ids.json

1. Go to [filter_ids.json](https://huggingface.co/spaces/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates/blob/main/filter_ids.json)
2. Click Edit
3. Add your ids, 1 per row
4. Comment with your justification
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Replace everything after the marker with the freshly generated text
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker itself, then the generated section
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme