Commit
•
285612d
1
Parent(s):
1d46c26
Major updates: moving away from pushshift.io to PRAW
Browse files
- .gitignore +2 -1
- Dockerfile +6 -2
- app.py +2 -2
- archive/subreddit_downloader.py +0 -145
- main.py +40 -94
- media/reddit_scraper.drawio.html +0 -11
- media/reddit_scraper.drawio.png +0 -0
- notebooks/data_processing.ipynb +0 -0
- notebooks/explore.ipynb +0 -323
- notebooks/validate.ipynb +0 -617
- requirements.txt +5 -5
- utilities/data_collator.py +55 -0
- utilities/my_logger.py +22 -0
- utilities/praw_downloader.py +54 -0
- utilities/praw_processor.py +35 -0
- utilities/readme_update.py +8 -12
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
.idea/
|
2 |
notebooks/.ipynb_checkpoints
|
3 |
-
mylog.log
|
|
|
|
1 |
.idea/
|
2 |
notebooks/.ipynb_checkpoints
|
3 |
+
mylog.log
|
4 |
+
.env
|
Dockerfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
# Use the official Python base image
|
2 |
-
FROM python:3.
|
3 |
|
4 |
# Install Git LFS
|
5 |
-
RUN
|
|
|
|
|
6 |
|
7 |
# https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
|
8 |
RUN useradd -m -u 1000 user
|
@@ -29,7 +31,9 @@ COPY . .
|
|
29 |
COPY supervisord.conf .
|
30 |
|
31 |
# Set permissions on the log file
|
|
|
32 |
RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
|
|
|
33 |
# RUN mkdir -m 777 -p /.cache/huggingface/hub/
|
34 |
|
35 |
|
|
|
1 |
# Use the official Python base image
|
2 |
+
FROM python:3.10
|
3 |
|
4 |
# Install Git LFS
|
5 |
+
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
6 |
+
RUN apt-get -o Acquire::AllowInsecureRepositories=true update && apt-get install -y git-lfs
|
7 |
+
#RUN apt-get update && apt-get install -y git-lfs
|
8 |
|
9 |
# https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
|
10 |
RUN useradd -m -u 1000 user
|
|
|
31 |
COPY supervisord.conf .
|
32 |
|
33 |
# Set permissions on the log file
|
34 |
+
USER root
|
35 |
RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
|
36 |
+
USER user
|
37 |
# RUN mkdir -m 777 -p /.cache/huggingface/hub/
|
38 |
|
39 |
|
app.py
CHANGED
@@ -9,7 +9,7 @@ proj_dir = Path(__name__).parent
|
|
9 |
|
10 |
subreddit = os.environ["SUBREDDIT"]
|
11 |
username = os.environ["USERNAME"]
|
12 |
-
dataset_name = f"{username}/dataset-creator-{subreddit}"
|
13 |
|
14 |
|
15 |
def log_file_to_html_string():
|
@@ -37,7 +37,7 @@ markdown = f"""
|
|
37 |
# Reddit Scraper
|
38 |
This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
|
39 |
|
40 |
-
As shown below this space pulls data from
|
41 |
"""
|
42 |
|
43 |
with gr.Blocks() as demo:
|
|
|
9 |
|
10 |
subreddit = os.environ["SUBREDDIT"]
|
11 |
username = os.environ["USERNAME"]
|
12 |
+
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
|
13 |
|
14 |
|
15 |
def log_file_to_html_string():
|
|
|
37 |
# Reddit Scraper
|
38 |
This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
|
39 |
|
40 |
+
As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
|
41 |
"""
|
42 |
|
43 |
with gr.Blocks() as demo:
|
archive/subreddit_downloader.py
DELETED
@@ -1,145 +0,0 @@
|
|
1 |
-
import csv
|
2 |
-
import json
|
3 |
-
import sys
|
4 |
-
import time
|
5 |
-
import traceback
|
6 |
-
from datetime import datetime
|
7 |
-
|
8 |
-
import requests
|
9 |
-
|
10 |
-
username = "" # put the username you want to download in the quotes
|
11 |
-
subreddit = "BestofRedditorUpdates" # put the subreddit you want to download in the quotes
|
12 |
-
thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
|
13 |
-
# leave either one blank to download an entire user's or subreddit's history
|
14 |
-
# or fill in both to download a specific users history from a specific subreddit
|
15 |
-
|
16 |
-
# change this to one of "human", "csv" or "json"
|
17 |
-
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
|
18 |
-
# - csv: a comma seperated value file with the fields score, date, title, author, link and then body or url
|
19 |
-
# - json: the full json object
|
20 |
-
output_format = "csv"
|
21 |
-
|
22 |
-
# default start time is the current time and default end time is all history
|
23 |
-
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
|
24 |
-
# start_time = datetime.utcnow() # datetime.strptime("10/05/2021", "%m/%d/%Y")
|
25 |
-
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
|
26 |
-
end_time = None # datetime.strptime("09/25/2021", "%m/%d/%Y")
|
27 |
-
|
28 |
-
convert_to_ascii = False # don't touch this unless you know what you're doing
|
29 |
-
convert_thread_id_to_base_ten = True # don't touch this unless you know what you're doing
|
30 |
-
|
31 |
-
|
32 |
-
def write_csv_line(writer, obj, is_submission):
|
33 |
-
output_list = []
|
34 |
-
output_list.append(str(obj['score']))
|
35 |
-
output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
|
36 |
-
if is_submission:
|
37 |
-
output_list.append(obj['title'])
|
38 |
-
output_list.append(f"u/{obj['author']}")
|
39 |
-
output_list.append(f"https://www.reddit.com{obj['permalink']}")
|
40 |
-
if is_submission:
|
41 |
-
if obj['is_self']:
|
42 |
-
if 'selftext' in obj:
|
43 |
-
output_list.append(obj['selftext'])
|
44 |
-
else:
|
45 |
-
output_list.append("")
|
46 |
-
else:
|
47 |
-
output_list.append(obj['url'])
|
48 |
-
else:
|
49 |
-
output_list.append(obj['body'])
|
50 |
-
writer.writerow(output_list)
|
51 |
-
|
52 |
-
|
53 |
-
def write_json_line(handle, obj):
|
54 |
-
handle.write(json.dumps(obj))
|
55 |
-
handle.write("\n")
|
56 |
-
|
57 |
-
|
58 |
-
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
|
59 |
-
print(f"Saving to {filename}")
|
60 |
-
|
61 |
-
count = 0
|
62 |
-
if output_format == "human" or output_format == "json":
|
63 |
-
if convert_to_ascii:
|
64 |
-
handle = open(filename, 'w', encoding='ascii')
|
65 |
-
else:
|
66 |
-
handle = open(filename, 'w', encoding='UTF-8')
|
67 |
-
else:
|
68 |
-
handle = open(filename, 'w', encoding='UTF-8', newline='')
|
69 |
-
writer = csv.writer(handle)
|
70 |
-
|
71 |
-
previous_epoch = int(start_datetime.timestamp())
|
72 |
-
break_out = False
|
73 |
-
while True:
|
74 |
-
new_url = url_base + str(previous_epoch)
|
75 |
-
json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
|
76 |
-
time.sleep(1) # pushshift has a rate limit, if we send requests too fast it will start returning error messages
|
77 |
-
try:
|
78 |
-
json_data = json_text.json()
|
79 |
-
except json.decoder.JSONDecodeError:
|
80 |
-
time.sleep(1)
|
81 |
-
continue
|
82 |
-
|
83 |
-
if 'data' not in json_data:
|
84 |
-
break
|
85 |
-
objects = json_data['data']
|
86 |
-
if len(objects) == 0:
|
87 |
-
break
|
88 |
-
|
89 |
-
for obj in objects:
|
90 |
-
previous_epoch = obj['created_utc'] - 1
|
91 |
-
if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
|
92 |
-
break_out = True
|
93 |
-
break
|
94 |
-
count += 1
|
95 |
-
try:
|
96 |
-
if output_format == "csv":
|
97 |
-
write_csv_line(writer, obj, is_submission)
|
98 |
-
elif output_format == "json":
|
99 |
-
write_json_line(handle, obj)
|
100 |
-
except Exception as err:
|
101 |
-
if 'permalink' in obj:
|
102 |
-
print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
|
103 |
-
else:
|
104 |
-
print(f"Couldn't print object, missing permalink: {obj['id']}")
|
105 |
-
print(err)
|
106 |
-
print(traceback.format_exc())
|
107 |
-
|
108 |
-
if break_out:
|
109 |
-
break
|
110 |
-
|
111 |
-
print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
|
112 |
-
|
113 |
-
print(f"Saved {count}")
|
114 |
-
handle.close()
|
115 |
-
|
116 |
-
|
117 |
-
if __name__ == "__main__":
|
118 |
-
filter_string = None
|
119 |
-
if username == "" and subreddit == "" and thread_id == "":
|
120 |
-
print("Fill in username, subreddit or thread id")
|
121 |
-
sys.exit(0)
|
122 |
-
if output_format not in ("human", "csv", "json"):
|
123 |
-
print("Output format must be one of human, csv, json")
|
124 |
-
sys.exit(0)
|
125 |
-
|
126 |
-
filters = []
|
127 |
-
if username:
|
128 |
-
filters.append(f"author={username}")
|
129 |
-
if subreddit:
|
130 |
-
filters.append(f"subreddit={subreddit}")
|
131 |
-
if thread_id:
|
132 |
-
if convert_thread_id_to_base_ten:
|
133 |
-
filters.append(f"link_id={int(thread_id, 36)}")
|
134 |
-
else:
|
135 |
-
filters.append(f"link_id=t3_{thread_id}")
|
136 |
-
filter_string = '&'.join(filters)
|
137 |
-
|
138 |
-
url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
|
139 |
-
|
140 |
-
if not thread_id:
|
141 |
-
download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
|
142 |
-
end_time, True, convert_to_ascii)
|
143 |
-
# download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
|
144 |
-
# end_time, False, convert_to_ascii)
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
@@ -3,17 +3,18 @@ import time
|
|
3 |
from datetime import datetime, timedelta
|
4 |
|
5 |
import pandas as pd
|
6 |
-
|
|
|
7 |
from huggingface_hub import login
|
8 |
|
9 |
-
from
|
10 |
-
from utilities.
|
11 |
from utilities.readme_update import update_readme
|
12 |
|
13 |
# Set dataset name, path to README.md, and existing dataset details
|
14 |
subreddit = os.environ["SUBREDDIT"]
|
15 |
username = os.environ["USERNAME"]
|
16 |
-
dataset_name = f"{username}/dataset-creator-{subreddit}"
|
17 |
dataset_readme_path = "README.md"
|
18 |
|
19 |
# Authenticate with Hugging Face using an auth token
|
@@ -23,94 +24,6 @@ login(auth_token, add_to_git_credential=True)
|
|
23 |
logger = setup_logger(__name__)
|
24 |
|
25 |
|
26 |
-
def main(dataset, date_to_fetch):
|
27 |
-
"""
|
28 |
-
Runs the main data processing function to fetch and process subreddit data for the specified date.
|
29 |
-
|
30 |
-
Args:
|
31 |
-
dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
|
32 |
-
date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
|
33 |
-
|
34 |
-
Returns:
|
35 |
-
most_recent_date (str): The most recent date in the updated dataset.
|
36 |
-
"""
|
37 |
-
# Call get_subreddit_day with the calculated date
|
38 |
-
logger.info(f"Fetching data for {str(date_to_fetch)}")
|
39 |
-
submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
|
40 |
-
df = submissions_to_dataframe(submissions)
|
41 |
-
logger.debug(f"Data fetched for {str(date_to_fetch)}")
|
42 |
-
most_recent_date = date_to_fetch
|
43 |
-
|
44 |
-
# Append DataFrame to split 'all_days' or create new split
|
45 |
-
if "all_days" in dataset:
|
46 |
-
logger.debug("Appending data to split 'all_days'")
|
47 |
-
# Merge the new submissions
|
48 |
-
old_data = dataset['all_days'].to_pandas()
|
49 |
-
new_data = pd.concat([old_data, df], ignore_index=True)
|
50 |
-
if '__index_level_0__' in new_data.columns:
|
51 |
-
new_data = new_data.drop('__index_level_0__', axis=1)
|
52 |
-
|
53 |
-
# Drop duplicates just in case
|
54 |
-
new_data = new_data.drop_duplicates(subset=['id'], keep="first")
|
55 |
-
|
56 |
-
# Figure out dates when we restart
|
57 |
-
old_data_most_recent_date = old_data['date'].max()
|
58 |
-
old_data_most_recent_date = datetime.strptime(old_data_most_recent_date, '%Y-%m-%d').date()
|
59 |
-
most_recent_date = max(old_data_most_recent_date, most_recent_date)
|
60 |
-
|
61 |
-
if len(old_data) == len(new_data):
|
62 |
-
logger.warning("Data in hub is much more recent, using that next!")
|
63 |
-
return most_recent_date
|
64 |
-
|
65 |
-
# Convert back to dataset
|
66 |
-
dataset["all_days"] = Dataset.from_pandas(new_data)
|
67 |
-
|
68 |
-
# Update README
|
69 |
-
update_readme(dataset_name, subreddit, date_to_fetch)
|
70 |
-
else:
|
71 |
-
logger.debug("Creating new split 'all_days'")
|
72 |
-
dataset["all_days"] = Dataset.from_pandas(df)
|
73 |
-
# Log appending or creating split 'all'
|
74 |
-
logger.debug("Appended or created split 'all_days'")
|
75 |
-
|
76 |
-
# Push the augmented dataset to the Hugging Face hub
|
77 |
-
logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
|
78 |
-
dataset.push_to_hub(dataset_name, token=auth_token)
|
79 |
-
logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
|
80 |
-
return most_recent_date
|
81 |
-
|
82 |
-
|
83 |
-
def run_main_continuously():
|
84 |
-
"""
|
85 |
-
This function runs the given `main_function` continuously, starting from the date specified
|
86 |
-
in the environment variable "START_DATE" until two days ago. Once it reaches two days ago,
|
87 |
-
it will wait until tomorrow to start again at the same time as when it started today.
|
88 |
-
"""
|
89 |
-
start_date_str = os.environ.get("START_DATE")
|
90 |
-
start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
|
91 |
-
|
92 |
-
# Calculate the start time for running the main_function every day.
|
93 |
-
start_time = datetime.now().time()
|
94 |
-
|
95 |
-
dataset = get_dataset()
|
96 |
-
|
97 |
-
while True:
|
98 |
-
today = datetime.now().date()
|
99 |
-
two_days_ago = today - timedelta(days=2)
|
100 |
-
|
101 |
-
if start_date <= two_days_ago:
|
102 |
-
logger.warning(f"Running main function for date: {start_date}")
|
103 |
-
most_recent_date = main(dataset, start_date)
|
104 |
-
start_date = most_recent_date + timedelta(days=1)
|
105 |
-
else:
|
106 |
-
tomorrow = today + timedelta(days=1)
|
107 |
-
now = datetime.now()
|
108 |
-
start_of_tomorrow = datetime.combine(tomorrow, start_time)
|
109 |
-
wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
|
110 |
-
logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
|
111 |
-
time.sleep(wait_until_tomorrow)
|
112 |
-
|
113 |
-
|
114 |
def get_dataset():
|
115 |
# Load the existing dataset from the Hugging Face hub or create a new one
|
116 |
try:
|
@@ -124,5 +37,38 @@ def get_dataset():
|
|
124 |
return dataset
|
125 |
|
126 |
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from datetime import datetime, timedelta
|
4 |
|
5 |
import pandas as pd
|
6 |
+
import schedule
|
7 |
+
from datasets import DatasetDict, load_dataset, Dataset
|
8 |
from huggingface_hub import login
|
9 |
|
10 |
+
from utilities.data_collator import merge_and_filter_data
|
11 |
+
from utilities.my_logger import setup_logger
|
12 |
from utilities.readme_update import update_readme
|
13 |
|
14 |
# Set dataset name, path to README.md, and existing dataset details
|
15 |
subreddit = os.environ["SUBREDDIT"]
|
16 |
username = os.environ["USERNAME"]
|
17 |
+
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
|
18 |
dataset_readme_path = "README.md"
|
19 |
|
20 |
# Authenticate with Hugging Face using an auth token
|
|
|
24 |
logger = setup_logger(__name__)
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def get_dataset():
|
28 |
# Load the existing dataset from the Hugging Face hub or create a new one
|
29 |
try:
|
|
|
37 |
return dataset
|
38 |
|
39 |
|
40 |
+
def main():
    """Refresh the Hugging Face dataset and push it to the Hub.

    Steps, in order:
      1. Load the current dataset via ``get_dataset()``.
      2. Hand its ``train`` split (or an empty DataFrame when that split
         does not exist yet) to ``merge_and_filter_data``.
      3. Store the returned DataFrame back as the ``train`` split.
      4. Update the README with today's date.
      5. Push the dataset to the Hub under ``dataset_name``.
    """
    date = datetime.now().strftime('%Y-%m-%d')
    logger.warning(f"Running main function for date: {date}")
    dataset = get_dataset()

    # Get latest data and merge with historic data. On the very first run
    # there is no 'train' split yet, so start from an empty frame.
    old_df = dataset['train'].to_pandas() if 'train' in dataset else pd.DataFrame()
    new_df = merge_and_filter_data(old_df=old_df)
    # preserve_index=False keeps the pandas index from being stored as an
    # extra column in the dataset.
    dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)

    # Update README
    update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date)

    # Push the augmented dataset to the Hugging Face hub
    logger.debug(f"Pushing data for {date} to the Hugging Face hub")
    dataset.push_to_hub(dataset_name, token=auth_token)
    logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
|
57 |
+
|
58 |
+
|
59 |
+
def schedule_daily_task():
    """Run ``main`` once immediately, then once per day, forever.

    The daily run time is the current wall-clock time plus five seconds,
    formatted as ``HH:MM`` (the seconds are dropped by the format). After
    the job is registered, this function polls the scheduler once per
    second and never returns.
    """
    # Now + 5 seconds, at minute resolution because of the '%H:%M' format.
    start_time = (datetime.now() + timedelta(seconds=5)).strftime('%H:%M')
    logger.info(f'Scheduling tasks to run every day at: {start_time}')
    # Run once right away so a fresh start does not wait for the first
    # scheduled slot.
    main()
    schedule.every().day.at(start_time).do(main)

    while True:
        schedule.run_pending()
        time.sleep(1)


# Script entry point: start the scheduler loop when executed directly.
if __name__ == "__main__":
    schedule_daily_task()
|
media/reddit_scraper.drawio.html
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
<!--[if IE]><meta http-equiv="X-UA-Compatible" content="IE=5,IE=9" ><![endif]-->
|
2 |
-
<!DOCTYPE html>
|
3 |
-
<html>
|
4 |
-
<head>
|
5 |
-
<title>reddit_scraper</title>
|
6 |
-
<meta charset="utf-8"/>
|
7 |
-
</head>
|
8 |
-
<body><div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{"highlight":"#0000ff","nav":true,"resize":true,"toolbar":"zoom layers tags lightbox","edit":"_blank","xml":"<mxfile host=\"app.diagrams.net\" modified=\"2023-04-14T12:12:14.014Z\" agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\" etag=\"puEjOIZigDmpONhGThsE\" version=\"21.1.7\" type=\"device\">\n <diagram name=\"Page-1\" id=\"14ddc1Tw5ZQC4xUkB2ri\">\n <mxGraphModel dx=\"1034\" dy=\"783\" grid=\"1\" gridSize=\"10\" guides=\"1\" tooltips=\"1\" connect=\"1\" arrows=\"1\" fold=\"1\" page=\"1\" pageScale=\"1\" pageWidth=\"850\" pageHeight=\"1100\" math=\"0\" shadow=\"0\">\n <root>\n <mxCell id=\"0\" />\n <mxCell id=\"1\" parent=\"0\" />\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-3\" value=\"\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;\" edge=\"1\" parent=\"1\" source=\"KhBTRBst3V2Bs5u7l5Na-1\" target=\"KhBTRBst3V2Bs5u7l5Na-2\">\n <mxGeometry relative=\"1\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-7\" value=\"HF API\" style=\"edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\" vertex=\"1\" connectable=\"0\" parent=\"KhBTRBst3V2Bs5u7l5Na-3\">\n <mxGeometry x=\"-0.125\" y=\"1\" relative=\"1\" as=\"geometry\">\n <mxPoint as=\"offset\" />\n </mxGeometry>\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-1\" value=\"HF SPACE&lt;br&gt;&lt;a href=&quot;SPACE_LINK&quot;&gt;SPACE_NAME&lt;/a&gt;\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\" vertex=\"1\" parent=\"1\">\n <mxGeometry x=\"340\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-2\" value=\"HF DATASET &lt;br&gt;&lt;a href=&quot;DATASET_LINK&quot;&gt;DATASET_NAME&lt;/a&gt;\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\" vertex=\"1\" 
parent=\"1\">\n <mxGeometry x=\"110\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-4\" value=\"&lt;a href=&quot;pushshift.io&quot;&gt;Pushshift.io&lt;/a&gt;&lt;br&gt;Hosts Reddit Data\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#d5e8d4;strokeColor=#82b366;\" vertex=\"1\" parent=\"1\">\n <mxGeometry x=\"590\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-8\" value=\"\" style=\"endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;\" edge=\"1\" parent=\"1\" source=\"KhBTRBst3V2Bs5u7l5Na-1\" target=\"KhBTRBst3V2Bs5u7l5Na-4\">\n <mxGeometry width=\"50\" height=\"50\" relative=\"1\" as=\"geometry\">\n <mxPoint x=\"470\" y=\"530\" as=\"sourcePoint\" />\n <mxPoint x=\"520\" y=\"480\" as=\"targetPoint\" />\n </mxGeometry>\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-9\" value=\"HTTP\" style=\"edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\" vertex=\"1\" connectable=\"0\" parent=\"KhBTRBst3V2Bs5u7l5Na-8\">\n <mxGeometry x=\"0.225\" y=\"1\" relative=\"1\" as=\"geometry\">\n <mxPoint x=\"-9\" y=\"1\" as=\"offset\" />\n </mxGeometry>\n </mxCell>\n </root>\n </mxGraphModel>\n </diagram>\n</mxfile>\n"}"></div>
|
9 |
-
<script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>
|
10 |
-
</body>
|
11 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
media/reddit_scraper.drawio.png
CHANGED
notebooks/data_processing.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/explore.ipynb
DELETED
@@ -1,323 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": null,
|
6 |
-
"id": "730ba509",
|
7 |
-
"metadata": {},
|
8 |
-
"outputs": [],
|
9 |
-
"source": [
|
10 |
-
"from IPython.core.interactiveshell import InteractiveShell\n",
|
11 |
-
"InteractiveShell.ast_node_interactivity = \"all\""
|
12 |
-
]
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"cell_type": "code",
|
16 |
-
"execution_count": null,
|
17 |
-
"id": "d9acd4b6",
|
18 |
-
"metadata": {},
|
19 |
-
"outputs": [],
|
20 |
-
"source": [
|
21 |
-
"from pathlib import Path\n",
|
22 |
-
"import sys\n",
|
23 |
-
"proj_dir = Path.cwd().parent\n",
|
24 |
-
"\n",
|
25 |
-
"sys.path.append(str(proj_dir))\n"
|
26 |
-
]
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"cell_type": "code",
|
30 |
-
"execution_count": null,
|
31 |
-
"id": "62452860",
|
32 |
-
"metadata": {},
|
33 |
-
"outputs": [],
|
34 |
-
"source": [
|
35 |
-
"from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
|
36 |
-
]
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"cell_type": "code",
|
40 |
-
"execution_count": 4,
|
41 |
-
"id": "a956a623",
|
42 |
-
"metadata": {},
|
43 |
-
"outputs": [
|
44 |
-
{
|
45 |
-
"data": {
|
46 |
-
"application/vnd.jupyter.widget-view+json": {
|
47 |
-
"model_id": "17df3f2812084d3591e914ffcfd948b0",
|
48 |
-
"version_major": 2,
|
49 |
-
"version_minor": 0
|
50 |
-
},
|
51 |
-
"text/plain": [
|
52 |
-
"0it [00:00, ?it/s]"
|
53 |
-
]
|
54 |
-
},
|
55 |
-
"metadata": {},
|
56 |
-
"output_type": "display_data"
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"name": "stderr",
|
60 |
-
"output_type": "stream",
|
61 |
-
"text": [
|
62 |
-
"2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
|
63 |
-
"2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
|
64 |
-
"2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
|
65 |
-
"2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
|
66 |
-
"2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
|
67 |
-
"2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
|
68 |
-
"2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
|
69 |
-
]
|
70 |
-
}
|
71 |
-
],
|
72 |
-
"source": [
|
73 |
-
"subreddit_to_scrape = \"askreddit\"\n",
|
74 |
-
"day_to_scrape = \"2013-03-01\"\n",
|
75 |
-
"submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
|
76 |
-
]
|
77 |
-
},
|
78 |
-
{
|
79 |
-
"cell_type": "code",
|
80 |
-
"execution_count": 5,
|
81 |
-
"id": "b1cc845b",
|
82 |
-
"metadata": {},
|
83 |
-
"outputs": [
|
84 |
-
{
|
85 |
-
"data": {
|
86 |
-
"text/html": [
|
87 |
-
"<div>\n",
|
88 |
-
"<style scoped>\n",
|
89 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
90 |
-
" vertical-align: middle;\n",
|
91 |
-
" }\n",
|
92 |
-
"\n",
|
93 |
-
" .dataframe tbody tr th {\n",
|
94 |
-
" vertical-align: top;\n",
|
95 |
-
" }\n",
|
96 |
-
"\n",
|
97 |
-
" .dataframe thead th {\n",
|
98 |
-
" text-align: right;\n",
|
99 |
-
" }\n",
|
100 |
-
"</style>\n",
|
101 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
102 |
-
" <thead>\n",
|
103 |
-
" <tr style=\"text-align: right;\">\n",
|
104 |
-
" <th></th>\n",
|
105 |
-
" <th>permalink</th>\n",
|
106 |
-
" <th>selftext</th>\n",
|
107 |
-
" <th>url</th>\n",
|
108 |
-
" <th>created_utc</th>\n",
|
109 |
-
" <th>author</th>\n",
|
110 |
-
" <th>num_comments</th>\n",
|
111 |
-
" <th>score</th>\n",
|
112 |
-
" <th>title</th>\n",
|
113 |
-
" <th>id</th>\n",
|
114 |
-
" <th>downs</th>\n",
|
115 |
-
" <th>ups</th>\n",
|
116 |
-
" </tr>\n",
|
117 |
-
" </thead>\n",
|
118 |
-
" <tbody>\n",
|
119 |
-
" <tr>\n",
|
120 |
-
" <th>0</th>\n",
|
121 |
-
" <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
|
122 |
-
" <td>Basically, do other parts of the world have th...</td>\n",
|
123 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
124 |
-
" <td>2013-03-01 19:58:55</td>\n",
|
125 |
-
" <td>sjr63</td>\n",
|
126 |
-
" <td>1</td>\n",
|
127 |
-
" <td>1</td>\n",
|
128 |
-
" <td>In the way that popular English and American m...</td>\n",
|
129 |
-
" <td>19hbm0</td>\n",
|
130 |
-
" <td>0</td>\n",
|
131 |
-
" <td>1</td>\n",
|
132 |
-
" </tr>\n",
|
133 |
-
" <tr>\n",
|
134 |
-
" <th>1</th>\n",
|
135 |
-
" <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
|
136 |
-
" <td></td>\n",
|
137 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
138 |
-
" <td>2013-03-01 19:58:50</td>\n",
|
139 |
-
" <td>WeirdPlane</td>\n",
|
140 |
-
" <td>13</td>\n",
|
141 |
-
" <td>1</td>\n",
|
142 |
-
" <td>Could I buy an Android phone without a plan an...</td>\n",
|
143 |
-
" <td>19hblp</td>\n",
|
144 |
-
" <td>0</td>\n",
|
145 |
-
" <td>1</td>\n",
|
146 |
-
" </tr>\n",
|
147 |
-
" <tr>\n",
|
148 |
-
" <th>2</th>\n",
|
149 |
-
" <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
|
150 |
-
" <td>Yeah.\n",
|
151 |
-
"\n",
|
152 |
-
"How do I reddit? I don't use or read re...</td>\n",
|
153 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
154 |
-
" <td>2013-03-01 19:58:47</td>\n",
|
155 |
-
" <td>xxnovaroxgg</td>\n",
|
156 |
-
" <td>14</td>\n",
|
157 |
-
" <td>0</td>\n",
|
158 |
-
" <td>How do I reddit</td>\n",
|
159 |
-
" <td>19hblj</td>\n",
|
160 |
-
" <td>0</td>\n",
|
161 |
-
" <td>0</td>\n",
|
162 |
-
" </tr>\n",
|
163 |
-
" <tr>\n",
|
164 |
-
" <th>3</th>\n",
|
165 |
-
" <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
|
166 |
-
" <td>My brothers, dad and I have always been huge L...</td>\n",
|
167 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
168 |
-
" <td>2013-03-01 19:58:07</td>\n",
|
169 |
-
" <td>tuffstough</td>\n",
|
170 |
-
" <td>0</td>\n",
|
171 |
-
" <td>1</td>\n",
|
172 |
-
" <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
|
173 |
-
" <td>19hbjx</td>\n",
|
174 |
-
" <td>0</td>\n",
|
175 |
-
" <td>1</td>\n",
|
176 |
-
" </tr>\n",
|
177 |
-
" <tr>\n",
|
178 |
-
" <th>4</th>\n",
|
179 |
-
" <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
|
180 |
-
" <td>I'm curious, guys tend to get asked the usual ...</td>\n",
|
181 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
182 |
-
" <td>2013-03-01 19:57:58</td>\n",
|
183 |
-
" <td>redditredditx3</td>\n",
|
184 |
-
" <td>13</td>\n",
|
185 |
-
" <td>2</td>\n",
|
186 |
-
" <td>Female Redditors, which part of the male physi...</td>\n",
|
187 |
-
" <td>19hbjk</td>\n",
|
188 |
-
" <td>0</td>\n",
|
189 |
-
" <td>2</td>\n",
|
190 |
-
" </tr>\n",
|
191 |
-
" </tbody>\n",
|
192 |
-
"</table>\n",
|
193 |
-
"</div>"
|
194 |
-
],
|
195 |
-
"text/plain": [
|
196 |
-
" permalink \\\n",
|
197 |
-
"0 /r/AskReddit/comments/19hbm0/in_the_way_that_p... \n",
|
198 |
-
"1 /r/AskReddit/comments/19hblp/could_i_buy_an_an... \n",
|
199 |
-
"2 /r/AskReddit/comments/19hblj/how_do_i_reddit/ \n",
|
200 |
-
"3 /r/AskReddit/comments/19hbjx/xpost_rsurvival_h... \n",
|
201 |
-
"4 /r/AskReddit/comments/19hbjk/female_redditors_... \n",
|
202 |
-
"\n",
|
203 |
-
" selftext \\\n",
|
204 |
-
"0 Basically, do other parts of the world have th... \n",
|
205 |
-
"1 \n",
|
206 |
-
"2 Yeah.\n",
|
207 |
-
"\n",
|
208 |
-
"How do I reddit? I don't use or read re... \n",
|
209 |
-
"3 My brothers, dad and I have always been huge L... \n",
|
210 |
-
"4 I'm curious, guys tend to get asked the usual ... \n",
|
211 |
-
"\n",
|
212 |
-
" url created_utc \\\n",
|
213 |
-
"0 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:55 \n",
|
214 |
-
"1 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:50 \n",
|
215 |
-
"2 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:47 \n",
|
216 |
-
"3 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:07 \n",
|
217 |
-
"4 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:57:58 \n",
|
218 |
-
"\n",
|
219 |
-
" author num_comments score \\\n",
|
220 |
-
"0 sjr63 1 1 \n",
|
221 |
-
"1 WeirdPlane 13 1 \n",
|
222 |
-
"2 xxnovaroxgg 14 0 \n",
|
223 |
-
"3 tuffstough 0 1 \n",
|
224 |
-
"4 redditredditx3 13 2 \n",
|
225 |
-
"\n",
|
226 |
-
" title id downs ups \n",
|
227 |
-
"0 In the way that popular English and American m... 19hbm0 0 1 \n",
|
228 |
-
"1 Could I buy an Android phone without a plan an... 19hblp 0 1 \n",
|
229 |
-
"2 How do I reddit 19hblj 0 0 \n",
|
230 |
-
"3 (x-post r/survival) Have any redditors seen Le... 19hbjx 0 1 \n",
|
231 |
-
"4 Female Redditors, which part of the male physi... 19hbjk 0 2 "
|
232 |
-
]
|
233 |
-
},
|
234 |
-
"execution_count": 5,
|
235 |
-
"metadata": {},
|
236 |
-
"output_type": "execute_result"
|
237 |
-
}
|
238 |
-
],
|
239 |
-
"source": [
|
240 |
-
"df = submissions_to_dataframe(submissions)\n",
|
241 |
-
"df.head()"
|
242 |
-
]
|
243 |
-
},
|
244 |
-
{
|
245 |
-
"cell_type": "code",
|
246 |
-
"execution_count": null,
|
247 |
-
"id": "518addff",
|
248 |
-
"metadata": {},
|
249 |
-
"outputs": [],
|
250 |
-
"source": []
|
251 |
-
},
|
252 |
-
{
|
253 |
-
"cell_type": "code",
|
254 |
-
"execution_count": null,
|
255 |
-
"id": "6e5490dc",
|
256 |
-
"metadata": {},
|
257 |
-
"outputs": [],
|
258 |
-
"source": [
|
259 |
-
"start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
|
260 |
-
"start_date"
|
261 |
-
]
|
262 |
-
},
|
263 |
-
{
|
264 |
-
"cell_type": "code",
|
265 |
-
"execution_count": null,
|
266 |
-
"id": "bf13555a",
|
267 |
-
"metadata": {},
|
268 |
-
"outputs": [],
|
269 |
-
"source": [
|
270 |
-
"df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
|
271 |
-
]
|
272 |
-
},
|
273 |
-
{
|
274 |
-
"cell_type": "code",
|
275 |
-
"execution_count": null,
|
276 |
-
"id": "48e413f3",
|
277 |
-
"metadata": {},
|
278 |
-
"outputs": [],
|
279 |
-
"source": [
|
280 |
-
"df.head()"
|
281 |
-
]
|
282 |
-
},
|
283 |
-
{
|
284 |
-
"cell_type": "code",
|
285 |
-
"execution_count": null,
|
286 |
-
"id": "9e83befa",
|
287 |
-
"metadata": {},
|
288 |
-
"outputs": [],
|
289 |
-
"source": [
|
290 |
-
"df.dtypes"
|
291 |
-
]
|
292 |
-
},
|
293 |
-
{
|
294 |
-
"cell_type": "code",
|
295 |
-
"execution_count": null,
|
296 |
-
"id": "ba84be68",
|
297 |
-
"metadata": {},
|
298 |
-
"outputs": [],
|
299 |
-
"source": []
|
300 |
-
}
|
301 |
-
],
|
302 |
-
"metadata": {
|
303 |
-
"kernelspec": {
|
304 |
-
"display_name": "Python 3 (ipykernel)",
|
305 |
-
"language": "python",
|
306 |
-
"name": "python3"
|
307 |
-
},
|
308 |
-
"language_info": {
|
309 |
-
"codemirror_mode": {
|
310 |
-
"name": "ipython",
|
311 |
-
"version": 3
|
312 |
-
},
|
313 |
-
"file_extension": ".py",
|
314 |
-
"mimetype": "text/x-python",
|
315 |
-
"name": "python",
|
316 |
-
"nbconvert_exporter": "python",
|
317 |
-
"pygments_lexer": "ipython3",
|
318 |
-
"version": "3.9.16"
|
319 |
-
}
|
320 |
-
},
|
321 |
-
"nbformat": 4,
|
322 |
-
"nbformat_minor": 5
|
323 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/validate.ipynb
DELETED
@@ -1,617 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 1,
|
6 |
-
"id": "730ba509",
|
7 |
-
"metadata": {},
|
8 |
-
"outputs": [],
|
9 |
-
"source": [
|
10 |
-
"from IPython.core.interactiveshell import InteractiveShell\n",
|
11 |
-
"InteractiveShell.ast_node_interactivity = \"all\""
|
12 |
-
]
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"cell_type": "code",
|
16 |
-
"execution_count": 2,
|
17 |
-
"id": "d9acd4b6",
|
18 |
-
"metadata": {},
|
19 |
-
"outputs": [],
|
20 |
-
"source": [
|
21 |
-
"from pathlib import Path\n",
|
22 |
-
"import sys\n",
|
23 |
-
"proj_dir = Path.cwd().parent\n",
|
24 |
-
"\n",
|
25 |
-
"sys.path.append(str(proj_dir))\n"
|
26 |
-
]
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"cell_type": "code",
|
30 |
-
"execution_count": 4,
|
31 |
-
"id": "62452860",
|
32 |
-
"metadata": {},
|
33 |
-
"outputs": [],
|
34 |
-
"source": [
|
35 |
-
"from datasets import load_dataset"
|
36 |
-
]
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"cell_type": "code",
|
40 |
-
"execution_count": 28,
|
41 |
-
"id": "00affc9a",
|
42 |
-
"metadata": {},
|
43 |
-
"outputs": [
|
44 |
-
{
|
45 |
-
"data": {
|
46 |
-
"application/vnd.jupyter.widget-view+json": {
|
47 |
-
"model_id": "a106bb47c1194b15bc289d2ef24258af",
|
48 |
-
"version_major": 2,
|
49 |
-
"version_minor": 0
|
50 |
-
},
|
51 |
-
"text/plain": [
|
52 |
-
"Downloading readme: 0%| | 0.00/804 [00:00<?, ?B/s]"
|
53 |
-
]
|
54 |
-
},
|
55 |
-
"metadata": {},
|
56 |
-
"output_type": "display_data"
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"name": "stderr",
|
60 |
-
"output_type": "stream",
|
61 |
-
"text": [
|
62 |
-
"Using custom data configuration derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16\n"
|
63 |
-
]
|
64 |
-
},
|
65 |
-
{
|
66 |
-
"name": "stdout",
|
67 |
-
"output_type": "stream",
|
68 |
-
"text": [
|
69 |
-
"Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
{
|
73 |
-
"data": {
|
74 |
-
"application/vnd.jupyter.widget-view+json": {
|
75 |
-
"model_id": "705d55e70bf442f98a51dd0618a5c2c6",
|
76 |
-
"version_major": 2,
|
77 |
-
"version_minor": 0
|
78 |
-
},
|
79 |
-
"text/plain": [
|
80 |
-
"Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
81 |
-
]
|
82 |
-
},
|
83 |
-
"metadata": {},
|
84 |
-
"output_type": "display_data"
|
85 |
-
},
|
86 |
-
{
|
87 |
-
"data": {
|
88 |
-
"application/vnd.jupyter.widget-view+json": {
|
89 |
-
"model_id": "139220a81674444997f7657a4c2e1a01",
|
90 |
-
"version_major": 2,
|
91 |
-
"version_minor": 0
|
92 |
-
},
|
93 |
-
"text/plain": [
|
94 |
-
"Downloading data: 0%| | 0.00/702k [00:00<?, ?B/s]"
|
95 |
-
]
|
96 |
-
},
|
97 |
-
"metadata": {},
|
98 |
-
"output_type": "display_data"
|
99 |
-
},
|
100 |
-
{
|
101 |
-
"data": {
|
102 |
-
"application/vnd.jupyter.widget-view+json": {
|
103 |
-
"model_id": "1a361406937144cebd4ff6168e56ec3d",
|
104 |
-
"version_major": 2,
|
105 |
-
"version_minor": 0
|
106 |
-
},
|
107 |
-
"text/plain": [
|
108 |
-
"Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
109 |
-
]
|
110 |
-
},
|
111 |
-
"metadata": {},
|
112 |
-
"output_type": "display_data"
|
113 |
-
},
|
114 |
-
{
|
115 |
-
"data": {
|
116 |
-
"application/vnd.jupyter.widget-view+json": {
|
117 |
-
"model_id": "",
|
118 |
-
"version_major": 2,
|
119 |
-
"version_minor": 0
|
120 |
-
},
|
121 |
-
"text/plain": [
|
122 |
-
"Generating all_days split: 0%| | 0/3272 [00:00<?, ? examples/s]"
|
123 |
-
]
|
124 |
-
},
|
125 |
-
"metadata": {},
|
126 |
-
"output_type": "display_data"
|
127 |
-
},
|
128 |
-
{
|
129 |
-
"name": "stdout",
|
130 |
-
"output_type": "stream",
|
131 |
-
"text": [
|
132 |
-
"Dataset parquet downloaded and prepared to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
|
133 |
-
]
|
134 |
-
},
|
135 |
-
{
|
136 |
-
"data": {
|
137 |
-
"application/vnd.jupyter.widget-view+json": {
|
138 |
-
"model_id": "4df7107473904386aebd66c543858abd",
|
139 |
-
"version_major": 2,
|
140 |
-
"version_minor": 0
|
141 |
-
},
|
142 |
-
"text/plain": [
|
143 |
-
" 0%| | 0/1 [00:00<?, ?it/s]"
|
144 |
-
]
|
145 |
-
},
|
146 |
-
"metadata": {},
|
147 |
-
"output_type": "display_data"
|
148 |
-
}
|
149 |
-
],
|
150 |
-
"source": [
|
151 |
-
"dataset = load_dataset('derek-thomas/dataset-creator-askreddit', download_mode=\"reuse_cache_if_exists\", ignore_verifications=True)"
|
152 |
-
]
|
153 |
-
},
|
154 |
-
{
|
155 |
-
"cell_type": "code",
|
156 |
-
"execution_count": 29,
|
157 |
-
"id": "ba84be68",
|
158 |
-
"metadata": {},
|
159 |
-
"outputs": [
|
160 |
-
{
|
161 |
-
"data": {
|
162 |
-
"text/html": [
|
163 |
-
"<div>\n",
|
164 |
-
"<style scoped>\n",
|
165 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
166 |
-
" vertical-align: middle;\n",
|
167 |
-
" }\n",
|
168 |
-
"\n",
|
169 |
-
" .dataframe tbody tr th {\n",
|
170 |
-
" vertical-align: top;\n",
|
171 |
-
" }\n",
|
172 |
-
"\n",
|
173 |
-
" .dataframe thead th {\n",
|
174 |
-
" text-align: right;\n",
|
175 |
-
" }\n",
|
176 |
-
"</style>\n",
|
177 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
178 |
-
" <thead>\n",
|
179 |
-
" <tr style=\"text-align: right;\">\n",
|
180 |
-
" <th></th>\n",
|
181 |
-
" <th>score</th>\n",
|
182 |
-
" <th>num_comments</th>\n",
|
183 |
-
" <th>title</th>\n",
|
184 |
-
" <th>permalink</th>\n",
|
185 |
-
" <th>selftext</th>\n",
|
186 |
-
" <th>url</th>\n",
|
187 |
-
" <th>created_utc</th>\n",
|
188 |
-
" <th>author</th>\n",
|
189 |
-
" <th>id</th>\n",
|
190 |
-
" <th>downs</th>\n",
|
191 |
-
" <th>ups</th>\n",
|
192 |
-
" <th>date</th>\n",
|
193 |
-
" <th>time</th>\n",
|
194 |
-
" </tr>\n",
|
195 |
-
" </thead>\n",
|
196 |
-
" <tbody>\n",
|
197 |
-
" <tr>\n",
|
198 |
-
" <th>0</th>\n",
|
199 |
-
" <td>2</td>\n",
|
200 |
-
" <td>4</td>\n",
|
201 |
-
" <td>Reddit, if someone had to describe you to a st...</td>\n",
|
202 |
-
" <td>/r/AskReddit/comments/15sn6y/reddit_if_someone...</td>\n",
|
203 |
-
" <td>They would be talking about you without your p...</td>\n",
|
204 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
205 |
-
" <td>2013-01-01 23:59:40+00:00</td>\n",
|
206 |
-
" <td>[deleted]</td>\n",
|
207 |
-
" <td>15sn6y</td>\n",
|
208 |
-
" <td>0</td>\n",
|
209 |
-
" <td>2</td>\n",
|
210 |
-
" <td>2013-01-01</td>\n",
|
211 |
-
" <td>23:59:40</td>\n",
|
212 |
-
" </tr>\n",
|
213 |
-
" <tr>\n",
|
214 |
-
" <th>1</th>\n",
|
215 |
-
" <td>5</td>\n",
|
216 |
-
" <td>24</td>\n",
|
217 |
-
" <td>What kind of car does the average \\nRedditor d...</td>\n",
|
218 |
-
" <td>/r/AskReddit/comments/15sn6m/what_kind_of_car_...</td>\n",
|
219 |
-
" <td>I've always wanted to know what kind of car th...</td>\n",
|
220 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
221 |
-
" <td>2013-01-01 23:59:31+00:00</td>\n",
|
222 |
-
" <td>PaytonAdams</td>\n",
|
223 |
-
" <td>15sn6m</td>\n",
|
224 |
-
" <td>0</td>\n",
|
225 |
-
" <td>5</td>\n",
|
226 |
-
" <td>2013-01-01</td>\n",
|
227 |
-
" <td>23:59:31</td>\n",
|
228 |
-
" </tr>\n",
|
229 |
-
" <tr>\n",
|
230 |
-
" <th>2</th>\n",
|
231 |
-
" <td>1</td>\n",
|
232 |
-
" <td>5</td>\n",
|
233 |
-
" <td>What movies have made you go back to the theat...</td>\n",
|
234 |
-
" <td>/r/AskReddit/comments/15sn6b/what_movies_have_...</td>\n",
|
235 |
-
" <td></td>\n",
|
236 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
237 |
-
" <td>2013-01-01 23:59:20+00:00</td>\n",
|
238 |
-
" <td>[deleted]</td>\n",
|
239 |
-
" <td>15sn6b</td>\n",
|
240 |
-
" <td>0</td>\n",
|
241 |
-
" <td>1</td>\n",
|
242 |
-
" <td>2013-01-01</td>\n",
|
243 |
-
" <td>23:59:20</td>\n",
|
244 |
-
" </tr>\n",
|
245 |
-
" <tr>\n",
|
246 |
-
" <th>3</th>\n",
|
247 |
-
" <td>0</td>\n",
|
248 |
-
" <td>18</td>\n",
|
249 |
-
" <td>Worst fear(s)?</td>\n",
|
250 |
-
" <td>/r/AskReddit/comments/15sn4u/worst_fears/</td>\n",
|
251 |
-
" <td>So what is your worst fear, reddit?</td>\n",
|
252 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
253 |
-
" <td>2013-01-01 23:58:37+00:00</td>\n",
|
254 |
-
" <td>[deleted]</td>\n",
|
255 |
-
" <td>15sn4u</td>\n",
|
256 |
-
" <td>0</td>\n",
|
257 |
-
" <td>0</td>\n",
|
258 |
-
" <td>2013-01-01</td>\n",
|
259 |
-
" <td>23:58:37</td>\n",
|
260 |
-
" </tr>\n",
|
261 |
-
" <tr>\n",
|
262 |
-
" <th>4</th>\n",
|
263 |
-
" <td>11</td>\n",
|
264 |
-
" <td>29</td>\n",
|
265 |
-
" <td>If there was a type of ink that lasted only fo...</td>\n",
|
266 |
-
" <td>/r/AskReddit/comments/15sn44/if_there_was_a_ty...</td>\n",
|
267 |
-
" <td></td>\n",
|
268 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
269 |
-
" <td>2013-01-01 23:58:15+00:00</td>\n",
|
270 |
-
" <td>Honeybeard</td>\n",
|
271 |
-
" <td>15sn44</td>\n",
|
272 |
-
" <td>0</td>\n",
|
273 |
-
" <td>11</td>\n",
|
274 |
-
" <td>2013-01-01</td>\n",
|
275 |
-
" <td>23:58:15</td>\n",
|
276 |
-
" </tr>\n",
|
277 |
-
" <tr>\n",
|
278 |
-
" <th>...</th>\n",
|
279 |
-
" <td>...</td>\n",
|
280 |
-
" <td>...</td>\n",
|
281 |
-
" <td>...</td>\n",
|
282 |
-
" <td>...</td>\n",
|
283 |
-
" <td>...</td>\n",
|
284 |
-
" <td>...</td>\n",
|
285 |
-
" <td>...</td>\n",
|
286 |
-
" <td>...</td>\n",
|
287 |
-
" <td>...</td>\n",
|
288 |
-
" <td>...</td>\n",
|
289 |
-
" <td>...</td>\n",
|
290 |
-
" <td>...</td>\n",
|
291 |
-
" <td>...</td>\n",
|
292 |
-
" </tr>\n",
|
293 |
-
" <tr>\n",
|
294 |
-
" <th>3267</th>\n",
|
295 |
-
" <td>0</td>\n",
|
296 |
-
" <td>11</td>\n",
|
297 |
-
" <td>Smokers of Reddit- What are your reasons for s...</td>\n",
|
298 |
-
" <td>/r/AskReddit/comments/15qzen/smokers_of_reddit...</td>\n",
|
299 |
-
" <td>I'm very curious as to what causes someone to ...</td>\n",
|
300 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
301 |
-
" <td>2013-01-01 00:01:36+00:00</td>\n",
|
302 |
-
" <td>kelsofb</td>\n",
|
303 |
-
" <td>15qzen</td>\n",
|
304 |
-
" <td>0</td>\n",
|
305 |
-
" <td>0</td>\n",
|
306 |
-
" <td>2013-01-01</td>\n",
|
307 |
-
" <td>00:01:36</td>\n",
|
308 |
-
" </tr>\n",
|
309 |
-
" <tr>\n",
|
310 |
-
" <th>3268</th>\n",
|
311 |
-
" <td>1</td>\n",
|
312 |
-
" <td>4</td>\n",
|
313 |
-
" <td>Hi</td>\n",
|
314 |
-
" <td>/r/AskReddit/comments/15qzei/hi/</td>\n",
|
315 |
-
" <td></td>\n",
|
316 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
317 |
-
" <td>2013-01-01 00:01:34+00:00</td>\n",
|
318 |
-
" <td>ImJE5US</td>\n",
|
319 |
-
" <td>15qzei</td>\n",
|
320 |
-
" <td>0</td>\n",
|
321 |
-
" <td>1</td>\n",
|
322 |
-
" <td>2013-01-01</td>\n",
|
323 |
-
" <td>00:01:34</td>\n",
|
324 |
-
" </tr>\n",
|
325 |
-
" <tr>\n",
|
326 |
-
" <th>3269</th>\n",
|
327 |
-
" <td>1</td>\n",
|
328 |
-
" <td>2</td>\n",
|
329 |
-
" <td>At the stroke of midnight I was writing this p...</td>\n",
|
330 |
-
" <td>/r/AskReddit/comments/15qzdx/at_the_stroke_of_...</td>\n",
|
331 |
-
" <td></td>\n",
|
332 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
333 |
-
" <td>2013-01-01 00:01:15+00:00</td>\n",
|
334 |
-
" <td>Sangfroid_Sonder</td>\n",
|
335 |
-
" <td>15qzdx</td>\n",
|
336 |
-
" <td>0</td>\n",
|
337 |
-
" <td>1</td>\n",
|
338 |
-
" <td>2013-01-01</td>\n",
|
339 |
-
" <td>00:01:15</td>\n",
|
340 |
-
" </tr>\n",
|
341 |
-
" <tr>\n",
|
342 |
-
" <th>3270</th>\n",
|
343 |
-
" <td>1</td>\n",
|
344 |
-
" <td>2</td>\n",
|
345 |
-
" <td>With all the rape stories in the news, why don...</td>\n",
|
346 |
-
" <td>/r/AskReddit/comments/15qzdc/with_all_the_rape...</td>\n",
|
347 |
-
" <td></td>\n",
|
348 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
349 |
-
" <td>2013-01-01 00:00:58+00:00</td>\n",
|
350 |
-
" <td>[deleted]</td>\n",
|
351 |
-
" <td>15qzdc</td>\n",
|
352 |
-
" <td>0</td>\n",
|
353 |
-
" <td>1</td>\n",
|
354 |
-
" <td>2013-01-01</td>\n",
|
355 |
-
" <td>00:00:58</td>\n",
|
356 |
-
" </tr>\n",
|
357 |
-
" <tr>\n",
|
358 |
-
" <th>3271</th>\n",
|
359 |
-
" <td>0</td>\n",
|
360 |
-
" <td>3</td>\n",
|
361 |
-
" <td>Do beautiful people have low entropy?</td>\n",
|
362 |
-
" <td>/r/AskReddit/comments/15qzd3/do_beautiful_peop...</td>\n",
|
363 |
-
" <td>I have been reading about entropy and arrows o...</td>\n",
|
364 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
365 |
-
" <td>2013-01-01 00:00:53+00:00</td>\n",
|
366 |
-
" <td>[deleted]</td>\n",
|
367 |
-
" <td>15qzd3</td>\n",
|
368 |
-
" <td>0</td>\n",
|
369 |
-
" <td>0</td>\n",
|
370 |
-
" <td>2013-01-01</td>\n",
|
371 |
-
" <td>00:00:53</td>\n",
|
372 |
-
" </tr>\n",
|
373 |
-
" </tbody>\n",
|
374 |
-
"</table>\n",
|
375 |
-
"<p>3272 rows × 13 columns</p>\n",
|
376 |
-
"</div>"
|
377 |
-
],
|
378 |
-
"text/plain": [
|
379 |
-
" score num_comments title \\\n",
|
380 |
-
"0 2 4 Reddit, if someone had to describe you to a st... \n",
|
381 |
-
"1 5 24 What kind of car does the average \\nRedditor d... \n",
|
382 |
-
"2 1 5 What movies have made you go back to the theat... \n",
|
383 |
-
"3 0 18 Worst fear(s)? \n",
|
384 |
-
"4 11 29 If there was a type of ink that lasted only fo... \n",
|
385 |
-
"... ... ... ... \n",
|
386 |
-
"3267 0 11 Smokers of Reddit- What are your reasons for s... \n",
|
387 |
-
"3268 1 4 Hi \n",
|
388 |
-
"3269 1 2 At the stroke of midnight I was writing this p... \n",
|
389 |
-
"3270 1 2 With all the rape stories in the news, why don... \n",
|
390 |
-
"3271 0 3 Do beautiful people have low entropy? \n",
|
391 |
-
"\n",
|
392 |
-
" permalink \\\n",
|
393 |
-
"0 /r/AskReddit/comments/15sn6y/reddit_if_someone... \n",
|
394 |
-
"1 /r/AskReddit/comments/15sn6m/what_kind_of_car_... \n",
|
395 |
-
"2 /r/AskReddit/comments/15sn6b/what_movies_have_... \n",
|
396 |
-
"3 /r/AskReddit/comments/15sn4u/worst_fears/ \n",
|
397 |
-
"4 /r/AskReddit/comments/15sn44/if_there_was_a_ty... \n",
|
398 |
-
"... ... \n",
|
399 |
-
"3267 /r/AskReddit/comments/15qzen/smokers_of_reddit... \n",
|
400 |
-
"3268 /r/AskReddit/comments/15qzei/hi/ \n",
|
401 |
-
"3269 /r/AskReddit/comments/15qzdx/at_the_stroke_of_... \n",
|
402 |
-
"3270 /r/AskReddit/comments/15qzdc/with_all_the_rape... \n",
|
403 |
-
"3271 /r/AskReddit/comments/15qzd3/do_beautiful_peop... \n",
|
404 |
-
"\n",
|
405 |
-
" selftext \\\n",
|
406 |
-
"0 They would be talking about you without your p... \n",
|
407 |
-
"1 I've always wanted to know what kind of car th... \n",
|
408 |
-
"2 \n",
|
409 |
-
"3 So what is your worst fear, reddit? \n",
|
410 |
-
"4 \n",
|
411 |
-
"... ... \n",
|
412 |
-
"3267 I'm very curious as to what causes someone to ... \n",
|
413 |
-
"3268 \n",
|
414 |
-
"3269 \n",
|
415 |
-
"3270 \n",
|
416 |
-
"3271 I have been reading about entropy and arrows o... \n",
|
417 |
-
"\n",
|
418 |
-
" url \\\n",
|
419 |
-
"0 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
420 |
-
"1 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
421 |
-
"2 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
422 |
-
"3 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
423 |
-
"4 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
424 |
-
"... ... \n",
|
425 |
-
"3267 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
426 |
-
"3268 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
427 |
-
"3269 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
428 |
-
"3270 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
429 |
-
"3271 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
430 |
-
"\n",
|
431 |
-
" created_utc author id downs ups \\\n",
|
432 |
-
"0 2013-01-01 23:59:40+00:00 [deleted] 15sn6y 0 2 \n",
|
433 |
-
"1 2013-01-01 23:59:31+00:00 PaytonAdams 15sn6m 0 5 \n",
|
434 |
-
"2 2013-01-01 23:59:20+00:00 [deleted] 15sn6b 0 1 \n",
|
435 |
-
"3 2013-01-01 23:58:37+00:00 [deleted] 15sn4u 0 0 \n",
|
436 |
-
"4 2013-01-01 23:58:15+00:00 Honeybeard 15sn44 0 11 \n",
|
437 |
-
"... ... ... ... ... ... \n",
|
438 |
-
"3267 2013-01-01 00:01:36+00:00 kelsofb 15qzen 0 0 \n",
|
439 |
-
"3268 2013-01-01 00:01:34+00:00 ImJE5US 15qzei 0 1 \n",
|
440 |
-
"3269 2013-01-01 00:01:15+00:00 Sangfroid_Sonder 15qzdx 0 1 \n",
|
441 |
-
"3270 2013-01-01 00:00:58+00:00 [deleted] 15qzdc 0 1 \n",
|
442 |
-
"3271 2013-01-01 00:00:53+00:00 [deleted] 15qzd3 0 0 \n",
|
443 |
-
"\n",
|
444 |
-
" date time \n",
|
445 |
-
"0 2013-01-01 23:59:40 \n",
|
446 |
-
"1 2013-01-01 23:59:31 \n",
|
447 |
-
"2 2013-01-01 23:59:20 \n",
|
448 |
-
"3 2013-01-01 23:58:37 \n",
|
449 |
-
"4 2013-01-01 23:58:15 \n",
|
450 |
-
"... ... ... \n",
|
451 |
-
"3267 2013-01-01 00:01:36 \n",
|
452 |
-
"3268 2013-01-01 00:01:34 \n",
|
453 |
-
"3269 2013-01-01 00:01:15 \n",
|
454 |
-
"3270 2013-01-01 00:00:58 \n",
|
455 |
-
"3271 2013-01-01 00:00:53 \n",
|
456 |
-
"\n",
|
457 |
-
"[3272 rows x 13 columns]"
|
458 |
-
]
|
459 |
-
},
|
460 |
-
"execution_count": 29,
|
461 |
-
"metadata": {},
|
462 |
-
"output_type": "execute_result"
|
463 |
-
}
|
464 |
-
],
|
465 |
-
"source": [
|
466 |
-
"df = dataset['all_days'].to_pandas()\n",
|
467 |
-
"df"
|
468 |
-
]
|
469 |
-
},
|
470 |
-
{
|
471 |
-
"cell_type": "code",
|
472 |
-
"execution_count": 16,
|
473 |
-
"id": "28df4b06",
|
474 |
-
"metadata": {},
|
475 |
-
"outputs": [
|
476 |
-
{
|
477 |
-
"data": {
|
478 |
-
"text/plain": [
|
479 |
-
"score Int64\n",
|
480 |
-
"num_comments Int64\n",
|
481 |
-
"title string\n",
|
482 |
-
"permalink string\n",
|
483 |
-
"selftext string\n",
|
484 |
-
"url string\n",
|
485 |
-
"created_utc string\n",
|
486 |
-
"author string\n",
|
487 |
-
"id string\n",
|
488 |
-
"downs Int64\n",
|
489 |
-
"ups Int64\n",
|
490 |
-
"dtype: object"
|
491 |
-
]
|
492 |
-
},
|
493 |
-
"execution_count": 16,
|
494 |
-
"metadata": {},
|
495 |
-
"output_type": "execute_result"
|
496 |
-
}
|
497 |
-
],
|
498 |
-
"source": [
|
499 |
-
"df.convert_dtypes().dtypes"
|
500 |
-
]
|
501 |
-
},
|
502 |
-
{
|
503 |
-
"cell_type": "code",
|
504 |
-
"execution_count": 18,
|
505 |
-
"id": "e322b6c0",
|
506 |
-
"metadata": {},
|
507 |
-
"outputs": [],
|
508 |
-
"source": [
|
509 |
-
"import pandas as pd"
|
510 |
-
]
|
511 |
-
},
|
512 |
-
{
|
513 |
-
"cell_type": "code",
|
514 |
-
"execution_count": 21,
|
515 |
-
"id": "ed1b06c3",
|
516 |
-
"metadata": {},
|
517 |
-
"outputs": [],
|
518 |
-
"source": [
|
519 |
-
"df['created_utc'] = pd.to_datetime(df['created_utc'])\n",
|
520 |
-
"df['date'] = df['created_utc'].dt.date\n",
|
521 |
-
"df['time'] = df['created_utc'].dt.time"
|
522 |
-
]
|
523 |
-
},
|
524 |
-
{
|
525 |
-
"cell_type": "code",
|
526 |
-
"execution_count": 33,
|
527 |
-
"id": "ff477737",
|
528 |
-
"metadata": {},
|
529 |
-
"outputs": [
|
530 |
-
{
|
531 |
-
"data": {
|
532 |
-
"text/plain": [
|
533 |
-
"2013-01-01 3272\n",
|
534 |
-
"Name: date, dtype: int64"
|
535 |
-
]
|
536 |
-
},
|
537 |
-
"execution_count": 33,
|
538 |
-
"metadata": {},
|
539 |
-
"output_type": "execute_result"
|
540 |
-
}
|
541 |
-
],
|
542 |
-
"source": [
|
543 |
-
"df.date.value_counts()"
|
544 |
-
]
|
545 |
-
},
|
546 |
-
{
|
547 |
-
"cell_type": "code",
|
548 |
-
"execution_count": 26,
|
549 |
-
"id": "1d11b967",
|
550 |
-
"metadata": {},
|
551 |
-
"outputs": [],
|
552 |
-
"source": [
|
553 |
-
"new_df = df.drop_duplicates(subset=['id'], keep=\"first\")"
|
554 |
-
]
|
555 |
-
},
|
556 |
-
{
|
557 |
-
"cell_type": "code",
|
558 |
-
"execution_count": 27,
|
559 |
-
"id": "eec00dd6",
|
560 |
-
"metadata": {},
|
561 |
-
"outputs": [
|
562 |
-
{
|
563 |
-
"data": {
|
564 |
-
"text/plain": [
|
565 |
-
"<Axes: >"
|
566 |
-
]
|
567 |
-
},
|
568 |
-
"execution_count": 27,
|
569 |
-
"metadata": {},
|
570 |
-
"output_type": "execute_result"
|
571 |
-
},
|
572 |
-
{
|
573 |
-
"data": {
|
574 |
-
"image/png": "\n",
|
575 |
-
"text/plain": [
|
576 |
-
"<Figure size 640x480 with 1 Axes>"
|
577 |
-
]
|
578 |
-
},
|
579 |
-
"metadata": {},
|
580 |
-
"output_type": "display_data"
|
581 |
-
}
|
582 |
-
],
|
583 |
-
"source": [
|
584 |
-
"new_df.date.hist(bins=400)"
|
585 |
-
]
|
586 |
-
},
|
587 |
-
{
|
588 |
-
"cell_type": "code",
|
589 |
-
"execution_count": null,
|
590 |
-
"id": "1acf60dc",
|
591 |
-
"metadata": {},
|
592 |
-
"outputs": [],
|
593 |
-
"source": []
|
594 |
-
}
|
595 |
-
],
|
596 |
-
"metadata": {
|
597 |
-
"kernelspec": {
|
598 |
-
"display_name": "Python 3 (ipykernel)",
|
599 |
-
"language": "python",
|
600 |
-
"name": "python3"
|
601 |
-
},
|
602 |
-
"language_info": {
|
603 |
-
"codemirror_mode": {
|
604 |
-
"name": "ipython",
|
605 |
-
"version": 3
|
606 |
-
},
|
607 |
-
"file_extension": ".py",
|
608 |
-
"mimetype": "text/x-python",
|
609 |
-
"name": "python",
|
610 |
-
"nbconvert_exporter": "python",
|
611 |
-
"pygments_lexer": "ipython3",
|
612 |
-
"version": "3.10.8"
|
613 |
-
}
|
614 |
-
},
|
615 |
-
"nbformat": 4,
|
616 |
-
"nbformat_minor": 5
|
617 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
-
praw==7.7.
|
2 |
-
gradio==3.
|
3 |
nbdev==2.3.12
|
4 |
-
datasets==2.
|
5 |
requests==2.28.2
|
6 |
loguru==0.7.0
|
7 |
rich==13.3.4
|
8 |
-
|
9 |
-
|
|
|
1 |
+
praw==7.7.1
|
2 |
+
gradio==3.50.2
|
3 |
nbdev==2.3.12
|
4 |
+
datasets==2.14.6
|
5 |
requests==2.28.2
|
6 |
loguru==0.7.0
|
7 |
rich==13.3.4
|
8 |
+
supervisor==4.2.5
|
9 |
+
schedule==1.2.0
|
utilities/data_collator.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from utilities.praw_downloader import praw_downloader
|
4 |
+
from utilities.praw_processor import preprocess_praw_data
|
5 |
+
|
6 |
+
|
7 |
+
def get_latest_data():
    """
    Fetch the newest submissions and return them as a preprocessed DataFrame.

    Returns:
    - pd.DataFrame: The output of preprocess_praw_data applied to whatever
      praw_downloader returned.
    """
    return preprocess_praw_data(submissions=praw_downloader())
|
11 |
+
|
12 |
+
|
13 |
+
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    The input DataFrame is left untouched: content lengths are computed in a
    separate Series rather than in a temporary column on ``df``.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.
      Rows are selected by index label, so the index labels should be unique.

    Returns:
    - pd.DataFrame: A filtered DataFrame with unique ids, where each id is associated
      with the longest content available. Rows keep their original index labels and
      come back in sorted 'id' order; on a tie the first row wins.
    """

    # Missing content (None/NaN) counts as length 0, so an id whose content is
    # entirely missing still yields a valid row label from idxmax.
    content_length = df['content'].str.len().fillna(0)

    # For each 'id', the index label of the row with the longest content
    idx_to_keep = content_length.groupby(df['id']).idxmax().values

    return df.loc[idx_to_keep]
|
38 |
+
|
39 |
+
|
40 |
+
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Combines previously collected submissions with the latest download.

    The two frames are concatenated, sorted by 'date_utc', re-indexed from zero,
    and then de-duplicated on 'id' via filter_redundant_ids.

    Args:
    - old_df (pd.DataFrame): The previously collected submissions.

    Returns:
    - pd.DataFrame: The merged and filtered submissions.
    """
    fresh_df = get_latest_data()

    combined = pd.concat([old_df, fresh_df], ignore_index=True)
    combined = combined.sort_values(by='date_utc')
    combined = combined.reset_index(drop=True)

    return filter_redundant_ids(combined)
|
utilities/my_logger.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
|
4 |
+
def setup_logger(name: str) -> logging.Logger:
    """
    Return a DEBUG-level logger that writes to 'mylog.log' and to the console.

    Calling this again with the same name returns the already configured logger
    without attaching further handlers.

    Parameters:
    - name (str): Logger name, passed to logging.getLogger.

    Returns:
    - logging.Logger: The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger hands back the same object for the same name. Attaching
    # handlers on every call would emit each record several times and open an
    # extra handle on mylog.log each time.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Create a file handler to write logs to a file
    file_handler = logging.FileHandler('mylog.log')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Create a stream handler to write logs to the console
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger
|
utilities/praw_downloader.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import datetime
|
3 |
+
from typing import Any, Dict, List
|
4 |
+
|
5 |
+
import praw
|
6 |
+
|
7 |
+
from utilities.my_logger import setup_logger
|
8 |
+
|
9 |
+
# Setup logging
|
10 |
+
logger = setup_logger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
def get_reddit_instance() -> praw.Reddit:
    """
    Build a PRAW Reddit client from environment variables.

    Reads REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT with
    os.getenv (each is None when unset) and passes ratelimit_seconds=20.
    """
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
    }
    return praw.Reddit(ratelimit_seconds=20, **credentials)
|
21 |
+
|
22 |
+
|
23 |
+
def extract_submission_data(submission: "praw.models.Submission") -> Dict[str, Any]:
    """
    Extract and return relevant data from a given Reddit submission.

    Parameters:
    - submission: Object exposing selftext, author, created_utc, link_flair_text,
      title, ups and permalink.

    Returns:
    - Dict[str, Any]: Keys 'content', 'poster', 'date_utc', 'flair', 'title',
      'score' and 'permalink'. 'date_utc' is a '%Y-%m-%d %H:%M:%S' string in UTC
      and 'poster' is str(submission.author).
    """
    # Imported here because this module's top-level imports do not include timezone.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; converting with an
    # explicit UTC tzinfo formats to exactly the same string.
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

    return {
        "content": submission.selftext,
        "poster": str(submission.author),
        "date_utc": created.strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
    }
|
34 |
+
|
35 |
+
|
36 |
+
def praw_downloader(subreddit_name: str = 'bestofredditorupdates', limit=200) -> List[Dict[str, Any]]:
    """
    Download the newest submissions of a subreddit and return them as dictionaries.

    Parameters:
    - subreddit_name (str): Subreddit to read. Defaults to 'bestofredditorupdates'.
    - limit (int or None): Maximum number of submissions requested from the 'new'
      listing. Defaults to 200. None is forwarded to PRAW unchanged; presumably
      it means "as many as the listing returns" — confirm against the PRAW docs.

    Returns:
    - List[Dict[str, Any]]: One extract_submission_data dictionary per submission,
      in the order the listing yields them.
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_name)

    logger.info(f'Starting to fetch submissions from {subreddit_name}.')

    submissions = []
    for submission in subreddit.new(limit=limit):
        logger.debug(f'Processing post {submission.id} - {submission.title}')
        submissions.append(extract_submission_data(submission))

    logger.info(f'Finished downloading {len(submissions)} submissions.')
    return submissions
|
51 |
+
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
praw_downloader()
|
utilities/praw_processor.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from utilities.my_logger import setup_logger
|
6 |
+
|
7 |
+
# Setup logging
|
8 |
+
logger = setup_logger(__name__)
|
9 |
+
|
10 |
+
|
11 |
+
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    Parameters:
    - submissions: List of submission dictionaries. Each one needs at least the
      'date_utc' and 'permalink' keys.

    Returns:
    - pd.DataFrame: One row per submission, with 'date_utc' parsed to datetime,
      any 'poster_link' column removed and an 'id' column taken from 'permalink'.
      An empty input yields an empty DataFrame.
    """

    # Convert the submissions list to a DataFrame
    praw_df = pd.DataFrame(submissions)

    # Nothing to transform: an empty frame has no 'date_utc' or 'permalink' column,
    # so the steps below would raise instead of returning an empty result.
    if praw_df.empty:
        return praw_df

    # Convert the 'date_utc' column to datetime format
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove the 'poster_link' column if it exists
    praw_df = praw_df.drop(columns=['poster_link'], errors='ignore')

    # A permalink looks like '/r/<subreddit>/comments/<id>/<slug>/'; splitting on '/'
    # gives a leading empty string, so the submission id sits at index 4.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df
|
utilities/readme_update.py
CHANGED
@@ -10,24 +10,20 @@ def get_readme_path(dataset_name):
|
|
10 |
return cached_path(readme_path, download_config=DownloadConfig())
|
11 |
|
12 |
|
13 |
-
def update_readme(dataset_name, subreddit,
|
14 |
path = get_readme_path(dataset_name=dataset_name)
|
15 |
readme_text = f"""
|
|
|
|
|
|
|
|
|
|
|
16 |
# Dataset Name
|
17 |
{dataset_name}
|
18 |
|
19 |
## Update Frequency
|
20 |
-
The dataset is updated daily
|
21 |
-
|
22 |
-
## Dataset Overview
|
23 |
-
The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
|
24 |
-
|
25 |
-
## Data Collection
|
26 |
-
This has been collected with sequential calls that follow the pagination of the pushshift request.
|
27 |
-
|
28 |
-
## Attribution
|
29 |
-
Data sourced from the Pushshift API.
|
30 |
-
"""
|
31 |
|
32 |
append_readme(path=path, readme_text=readme_text)
|
33 |
return readme_text
|
|
|
10 |
return cached_path(readme_path, download_config=DownloadConfig())
|
11 |
|
12 |
|
13 |
+
def update_readme(dataset_name, subreddit, latest_date):
    """
    Build the README text for the dataset and hand it to append_readme.

    Parameters:
    - dataset_name: Dataset identifier; used to locate the README through
      get_readme_path and shown under '# Dataset Name'.
    - subreddit: Subreddit name shown in the overview sentence.
    - latest_date: Value shown as the most recent day covered by the dataset.

    Returns:
    - str: The generated README text.
    """
    path = get_readme_path(dataset_name=dataset_name)
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions. I'm leveraging PRAW and the reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.

# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily with the most recent day being: {latest_date}
"""

    append_readme(path=path, readme_text=readme_text)
    return readme_text
|