Commit ef9cbc8 • 1 Parent(s): 3772eaf
Fix async bug and add readme
Files changed:
- app.py +6 -1
- src/readme_update.py +93 -0
- src/utilities.py +5 -5
app.py CHANGED

@@ -8,6 +8,7 @@ from src.my_logger import setup_logger
 from src.utilities import load_datasets, merge_and_update_datasets
 from src.visualize_logs import log_file_to_html_string
 from src.build_nomic import build_nomic
+from src.readme_update import update_dataset_readme

 proj_dir = Path(__name__).parent

@@ -96,13 +97,17 @@ async def community(payload: WebhookPayload):
     logger.info(f"Loaded new dataset")

     logger.info(f"Merging and Updating row...")
-    dataset = merge_and_update_datasets(dataset, original_dataset)
+    dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)

     # Push the augmented dataset to the Hugging Face hub
     logger.info(f"Pushing processed data to the Hugging Face Hub...")
     dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
     logger.info(f"Pushed processed data to the Hugging Face Hub")

+    update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
+    logger.info(f"Updated README.")
+
+    # Build Nomic
     logger.info(f"Building Nomic...")
     build_nomic(dataset=dataset)
     logger.info(f"Built Nomic")
src/readme_update.py ADDED

@@ -0,0 +1,93 @@
+import os
+from datetime import datetime
+from pathlib import Path
+from shutil import rmtree
+
+import pytz
+from huggingface_hub import HfApi, Repository
+
+GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---"
+hf_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+local_repo_path = "./readme_repo"
+
+
+def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int) -> None:
+    """
+    Update the README file of a specified dataset repository with new information.
+
+    Args:
+        dataset_name (str): Name of the dataset repository.
+        subreddit (str): Name of the subreddit being used for dataset creation.
+        new_rows (int): Number of new rows added in the latest update.
+        hf_token (str): Hugging Face authentication token.
+        local_repo_path (str): Local path to clone the repository.
+    """
+    # Initialize HfApi
+    api = HfApi()
+
+    if Path(local_repo_path).exists():
+        rmtree(local_repo_path)
+
+    # Clone the repository locally
+    repo = Repository(local_repo_path, clone_from=dataset_name, repo_type='dataset', use_auth_token=hf_token)
+
+    # Read the README file
+    with open(f"{local_repo_path}/README.md", "r") as file:
+        old_readme = file.read()
+
+    # Modify the README
+    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)
+
+    # Write the updated README back to the repository
+    with open(f"{local_repo_path}/README.md", "w") as file:
+        file.write(new_readme)
+
+    # Push the changes
+    repo.push_to_hub(blocking=True, commit_message=f'Pushing {new_rows} new rows')
+
+
+def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
+    """
+    Append new information to the existing README content.
+
+    Args:
+        subreddit (str): Name of the subreddit.
+        new_rows (int): Number of new rows added.
+        old_readme (str): Existing README content.
+
+    Returns:
+        str: Updated README content.
+    """
+    latest_hour = datetime.now(pytz.utc).replace(minute=0, second=0, microsecond=0)
+    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')
+
+    readme_text = f"""
+## Dataset Overview
+This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
+and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
+`content` field.
+
+The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.
+
+The last run was on {latest_hour_str} and updated {new_rows}.
+
+## Creation Details
+This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
+based on a repository update webhook to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
+visualization.
+
+## Update Frequency
+The dataset is updated based on a webhook trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
+is updated, this dataset will be updated.
+
+## Opt-out
+To opt-out of this dataset please make a request in the community tab
+"""
+
+    if GENERATED_BELOW_MARKER in old_readme:
+        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
+        new_readme = old_readme[:index] + "\n\n" + readme_text
+    else:
+        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"
+
+    return new_readme
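A note on the marker logic above: append_to_readme keeps everything up to and including GENERATED_BELOW_MARKER and replaces whatever follows it, so repeated webhook runs refresh the generated section instead of stacking copies. Below is a minimal sketch of that behavior with hypothetical README strings and subreddit value; it assumes the module is importable as src.readme_update and that HUGGINGFACE_AUTH_TOKEN is set, since the module reads that variable at import time.

import os
os.environ.setdefault("HUGGINGFACE_AUTH_TOKEN", "hf_dummy")  # module reads this env var at import time

from src.readme_update import GENERATED_BELOW_MARKER, append_to_readme

# First run: no marker in the card yet, so the marker plus a generated section is appended.
readme_v1 = append_to_readme(subreddit="bestofredditorupdates", new_rows=12,
                             old_readme="# My dataset card")
assert readme_v1.count(GENERATED_BELOW_MARKER) == 1

# Second run: everything after the marker is replaced rather than appended again.
readme_v2 = append_to_readme(subreddit="bestofredditorupdates", new_rows=3,
                             old_readme=readme_v1)
assert readme_v2.count(GENERATED_BELOW_MARKER) == 1
assert readme_v2.count("## Dataset Overview") == 1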
src/utilities.py CHANGED
@@ -15,7 +15,7 @@ PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
 logger = setup_logger(__name__)


-
+def load_datasets():
     # Get latest datasets locally
     logger.debug(f"Trying to download {PROCESSED_DATASET}")
     dataset = load_dataset(PROCESSED_DATASET, download_mode=DownloadMode.FORCE_REDOWNLOAD)
@@ -38,7 +38,7 @@ def merge_and_update_datasets(dataset, original_dataset):
     # Step 1: Merge df onto odf
     # We'll bring in 'content' and 'embedding' from df to compare and possibly update 'embedding'
     merged_df = pd.merge(odf, df[['id', 'content', 'embedding']], on='id', how='left', suffixes=('_odf', ''))
-
+    updated_row_count = len(merged_df[merged_df.content != merged_df.content_odf])

     # Step 2: Compare 'content' from odf and df, update 'embedding' if they differ
     merged_df['embedding'] = np.where(merged_df['content_odf'] != merged_df['content'], None, merged_df['embedding'])
@@ -48,15 +48,15 @@ def merge_and_update_datasets(dataset, original_dataset):
     merged_df = merged_df.drop(columns=['content', 'new', 'updated'])  # Update columns to match df
     merged_df.rename(columns={'content_odf': 'content'}, inplace=True)  # Rename 'content_odf' back to 'content'

-    logger.info(f"Updating {
+    logger.info(f"Updating {updated_row_count} rows...")
     # Iterate over the DataFrame rows where 'embedding' is None
     for index, row in merged_df[merged_df['embedding'].isnull()].iterrows():
         # Update 'embedding' for the current row using our function
         merged_df.at[index, 'embedding'] = update_embeddings(content=row['content'], client=client)

     dataset['train'] = Dataset.from_pandas(merged_df)
-    logger.info(f"Updated {
-    return dataset
+    logger.info(f"Updated {updated_row_count} rows")
+    return dataset, updated_row_count


 def update_embeddings(content, client):
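For context on the utilities change: merge_and_update_datasets now also returns updated_row_count, the number of merged rows whose fresh content differs from the original content_odf, and app.py forwards that count to the README update. A tiny standalone illustration of the counting expression, using made-up rows and placeholder scalar embeddings rather than the project's real data:

import numpy as np
import pandas as pd

# Hypothetical merge result: 'content_odf' carries the already-processed text,
# 'content' the freshly loaded text (suffixes=('_odf', '') in the real merge).
merged_df = pd.DataFrame({
    "id": [1, 2, 3],
    "content_odf": ["post A", "post B", "post C"],
    "content": ["post A", "post B edited", "post C"],
    "embedding": [0.11, 0.22, 0.33],
})

# The counting expression added in this commit
updated_row_count = len(merged_df[merged_df.content != merged_df.content_odf])
print(updated_row_count)  # 1 -> only the edited row will be re-embedded

# Step 2 of the existing function then clears embeddings for exactly those rows
merged_df["embedding"] = np.where(
    merged_df["content_odf"] != merged_df["content"], None, merged_df["embedding"]
)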