Commit
•
285612d
1
Parent(s):
1d46c26
Major updates, moving away from pushshift.io into PRAW
Browse files- .gitignore +2 -1
- Dockerfile +6 -2
- app.py +2 -2
- archive/subreddit_downloader.py +0 -145
- main.py +40 -94
- media/reddit_scraper.drawio.html +0 -11
- media/reddit_scraper.drawio.png +0 -0
- notebooks/data_processing.ipynb +0 -0
- notebooks/explore.ipynb +0 -323
- notebooks/validate.ipynb +0 -617
- requirements.txt +5 -5
- utilities/data_collator.py +55 -0
- utilities/my_logger.py +22 -0
- utilities/praw_downloader.py +54 -0
- utilities/praw_processor.py +35 -0
- utilities/readme_update.py +8 -12
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
.idea/
|
2 |
notebooks/.ipynb_checkpoints
|
3 |
-
mylog.log
|
|
|
|
1 |
.idea/
|
2 |
notebooks/.ipynb_checkpoints
|
3 |
+
mylog.log
|
4 |
+
.env
|
Dockerfile
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
# Use the official Python base image
|
2 |
-
FROM python:3.
|
3 |
|
4 |
# Install Git LFS
|
5 |
-
RUN
|
|
|
|
|
6 |
|
7 |
# https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
|
8 |
RUN useradd -m -u 1000 user
|
@@ -29,7 +31,9 @@ COPY . .
|
|
29 |
COPY supervisord.conf .
|
30 |
|
31 |
# Set permissions on the log file
|
|
|
32 |
RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
|
|
|
33 |
# RUN mkdir -m 777 -p /.cache/huggingface/hub/
|
34 |
|
35 |
|
|
|
1 |
# Use the official Python base image
|
2 |
+
FROM python:3.10
|
3 |
|
4 |
# Install Git LFS
|
5 |
+
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
6 |
+
RUN apt-get -o Acquire::AllowInsecureRepositories=true update && apt-get install -y git-lfs
|
7 |
+
#RUN apt-get update && apt-get install -y git-lfs
|
8 |
|
9 |
# https://discuss.huggingface.co/t/permission-denied-for-writing-files-within-spaces/29799
|
10 |
RUN useradd -m -u 1000 user
|
|
|
31 |
COPY supervisord.conf .
|
32 |
|
33 |
# Set permissions on the log file
|
34 |
+
USER root
|
35 |
RUN touch $HOME/app/mylog.log $HOME/app/supervisord.log && chmod a+rwx $HOME/app/mylog.log $HOME/app/supervisord.log
|
36 |
+
USER user
|
37 |
# RUN mkdir -m 777 -p /.cache/huggingface/hub/
|
38 |
|
39 |
|
app.py
CHANGED
@@ -9,7 +9,7 @@ proj_dir = Path(__name__).parent
|
|
9 |
|
10 |
subreddit = os.environ["SUBREDDIT"]
|
11 |
username = os.environ["USERNAME"]
|
12 |
-
dataset_name = f"{username}/dataset-creator-{subreddit}"
|
13 |
|
14 |
|
15 |
def log_file_to_html_string():
|
@@ -37,7 +37,7 @@ markdown = f"""
|
|
37 |
# Reddit Scraper
|
38 |
This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
|
39 |
|
40 |
-
As shown below this space pulls data from
|
41 |
"""
|
42 |
|
43 |
with gr.Blocks() as demo:
|
|
|
9 |
|
10 |
subreddit = os.environ["SUBREDDIT"]
|
11 |
username = os.environ["USERNAME"]
|
12 |
+
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
|
13 |
|
14 |
|
15 |
def log_file_to_html_string():
|
|
|
37 |
# Reddit Scraper
|
38 |
This is a reddit scraper which builds [{dataset_name}](https://huggingface.co/datasets/{dataset_name}).
|
39 |
|
40 |
+
As shown below this space pulls data from reddit via [PRAW](https://praw.readthedocs.io/en/stable/), processes it, and puts it in a corresponding dataset.
|
41 |
"""
|
42 |
|
43 |
with gr.Blocks() as demo:
|
archive/subreddit_downloader.py
DELETED
@@ -1,145 +0,0 @@
|
|
1 |
-
import csv
|
2 |
-
import json
|
3 |
-
import sys
|
4 |
-
import time
|
5 |
-
import traceback
|
6 |
-
from datetime import datetime
|
7 |
-
|
8 |
-
import requests
|
9 |
-
|
10 |
-
username = "" # put the username you want to download in the quotes
|
11 |
-
subreddit = "BestofRedditorUpdates" # put the subreddit you want to download in the quotes
|
12 |
-
thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
|
13 |
-
# leave either one blank to download an entire user's or subreddit's history
|
14 |
-
# or fill in both to download a specific users history from a specific subreddit
|
15 |
-
|
16 |
-
# change this to one of "human", "csv" or "json"
|
17 |
-
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
|
18 |
-
# - csv: a comma-separated values file with the fields score, date, title, author, link and then body or url
|
19 |
-
# - json: the full json object
|
20 |
-
output_format = "csv"
|
21 |
-
|
22 |
-
# default start time is the current time and default end time is all history
|
23 |
-
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
|
24 |
-
# start_time = datetime.utcnow() # datetime.strptime("10/05/2021", "%m/%d/%Y")
|
25 |
-
start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
|
26 |
-
end_time = None # datetime.strptime("09/25/2021", "%m/%d/%Y")
|
27 |
-
|
28 |
-
convert_to_ascii = False # don't touch this unless you know what you're doing
|
29 |
-
convert_thread_id_to_base_ten = True # don't touch this unless you know what you're doing
|
30 |
-
|
31 |
-
|
32 |
-
def write_csv_line(writer, obj, is_submission):
|
33 |
-
output_list = []
|
34 |
-
output_list.append(str(obj['score']))
|
35 |
-
output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
|
36 |
-
if is_submission:
|
37 |
-
output_list.append(obj['title'])
|
38 |
-
output_list.append(f"u/{obj['author']}")
|
39 |
-
output_list.append(f"https://www.reddit.com{obj['permalink']}")
|
40 |
-
if is_submission:
|
41 |
-
if obj['is_self']:
|
42 |
-
if 'selftext' in obj:
|
43 |
-
output_list.append(obj['selftext'])
|
44 |
-
else:
|
45 |
-
output_list.append("")
|
46 |
-
else:
|
47 |
-
output_list.append(obj['url'])
|
48 |
-
else:
|
49 |
-
output_list.append(obj['body'])
|
50 |
-
writer.writerow(output_list)
|
51 |
-
|
52 |
-
|
53 |
-
def write_json_line(handle, obj):
|
54 |
-
handle.write(json.dumps(obj))
|
55 |
-
handle.write("\n")
|
56 |
-
|
57 |
-
|
58 |
-
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
|
59 |
-
print(f"Saving to {filename}")
|
60 |
-
|
61 |
-
count = 0
|
62 |
-
if output_format == "human" or output_format == "json":
|
63 |
-
if convert_to_ascii:
|
64 |
-
handle = open(filename, 'w', encoding='ascii')
|
65 |
-
else:
|
66 |
-
handle = open(filename, 'w', encoding='UTF-8')
|
67 |
-
else:
|
68 |
-
handle = open(filename, 'w', encoding='UTF-8', newline='')
|
69 |
-
writer = csv.writer(handle)
|
70 |
-
|
71 |
-
previous_epoch = int(start_datetime.timestamp())
|
72 |
-
break_out = False
|
73 |
-
while True:
|
74 |
-
new_url = url_base + str(previous_epoch)
|
75 |
-
json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
|
76 |
-
time.sleep(1) # pushshift has a rate limit, if we send requests too fast it will start returning error messages
|
77 |
-
try:
|
78 |
-
json_data = json_text.json()
|
79 |
-
except json.decoder.JSONDecodeError:
|
80 |
-
time.sleep(1)
|
81 |
-
continue
|
82 |
-
|
83 |
-
if 'data' not in json_data:
|
84 |
-
break
|
85 |
-
objects = json_data['data']
|
86 |
-
if len(objects) == 0:
|
87 |
-
break
|
88 |
-
|
89 |
-
for obj in objects:
|
90 |
-
previous_epoch = obj['created_utc'] - 1
|
91 |
-
if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
|
92 |
-
break_out = True
|
93 |
-
break
|
94 |
-
count += 1
|
95 |
-
try:
|
96 |
-
if output_format == "csv":
|
97 |
-
write_csv_line(writer, obj, is_submission)
|
98 |
-
elif output_format == "json":
|
99 |
-
write_json_line(handle, obj)
|
100 |
-
except Exception as err:
|
101 |
-
if 'permalink' in obj:
|
102 |
-
print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
|
103 |
-
else:
|
104 |
-
print(f"Couldn't print object, missing permalink: {obj['id']}")
|
105 |
-
print(err)
|
106 |
-
print(traceback.format_exc())
|
107 |
-
|
108 |
-
if break_out:
|
109 |
-
break
|
110 |
-
|
111 |
-
print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
|
112 |
-
|
113 |
-
print(f"Saved {count}")
|
114 |
-
handle.close()
|
115 |
-
|
116 |
-
|
117 |
-
if __name__ == "__main__":
|
118 |
-
filter_string = None
|
119 |
-
if username == "" and subreddit == "" and thread_id == "":
|
120 |
-
print("Fill in username, subreddit or thread id")
|
121 |
-
sys.exit(0)
|
122 |
-
if output_format not in ("human", "csv", "json"):
|
123 |
-
print("Output format must be one of human, csv, json")
|
124 |
-
sys.exit(0)
|
125 |
-
|
126 |
-
filters = []
|
127 |
-
if username:
|
128 |
-
filters.append(f"author={username}")
|
129 |
-
if subreddit:
|
130 |
-
filters.append(f"subreddit={subreddit}")
|
131 |
-
if thread_id:
|
132 |
-
if convert_thread_id_to_base_ten:
|
133 |
-
filters.append(f"link_id={int(thread_id, 36)}")
|
134 |
-
else:
|
135 |
-
filters.append(f"link_id=t3_{thread_id}")
|
136 |
-
filter_string = '&'.join(filters)
|
137 |
-
|
138 |
-
url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
|
139 |
-
|
140 |
-
if not thread_id:
|
141 |
-
download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
|
142 |
-
end_time, True, convert_to_ascii)
|
143 |
-
# download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
|
144 |
-
# end_time, False, convert_to_ascii)
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
@@ -3,17 +3,18 @@ import time
|
|
3 |
from datetime import datetime, timedelta
|
4 |
|
5 |
import pandas as pd
|
6 |
-
|
|
|
7 |
from huggingface_hub import login
|
8 |
|
9 |
-
from
|
10 |
-
from utilities.
|
11 |
from utilities.readme_update import update_readme
|
12 |
|
13 |
# Set dataset name, path to README.md, and existing dataset details
|
14 |
subreddit = os.environ["SUBREDDIT"]
|
15 |
username = os.environ["USERNAME"]
|
16 |
-
dataset_name = f"{username}/dataset-creator-{subreddit}"
|
17 |
dataset_readme_path = "README.md"
|
18 |
|
19 |
# Authenticate with Hugging Face using an auth token
|
@@ -23,94 +24,6 @@ login(auth_token, add_to_git_credential=True)
|
|
23 |
logger = setup_logger(__name__)
|
24 |
|
25 |
|
26 |
-
def main(dataset, date_to_fetch):
|
27 |
-
"""
|
28 |
-
Runs the main data processing function to fetch and process subreddit data for the specified date.
|
29 |
-
|
30 |
-
Args:
|
31 |
-
dataset (datasets.DatasetDict): The Hugging Face dataset to fetch and process subreddit data for.
|
32 |
-
date_to_fetch (str): The date to fetch subreddit data for, in YYYY-MM-DD format.
|
33 |
-
|
34 |
-
Returns:
|
35 |
-
most_recent_date (str): The most recent date in the updated dataset.
|
36 |
-
"""
|
37 |
-
# Call get_subreddit_day with the calculated date
|
38 |
-
logger.info(f"Fetching data for {str(date_to_fetch)}")
|
39 |
-
submissions = scrape_submissions_by_day(subreddit, str(date_to_fetch))
|
40 |
-
df = submissions_to_dataframe(submissions)
|
41 |
-
logger.debug(f"Data fetched for {str(date_to_fetch)}")
|
42 |
-
most_recent_date = date_to_fetch
|
43 |
-
|
44 |
-
# Append DataFrame to split 'all_days' or create new split
|
45 |
-
if "all_days" in dataset:
|
46 |
-
logger.debug("Appending data to split 'all_days'")
|
47 |
-
# Merge the new submissions
|
48 |
-
old_data = dataset['all_days'].to_pandas()
|
49 |
-
new_data = pd.concat([old_data, df], ignore_index=True)
|
50 |
-
if '__index_level_0__' in new_data.columns:
|
51 |
-
new_data = new_data.drop('__index_level_0__', axis=1)
|
52 |
-
|
53 |
-
# Drop duplicates just in case
|
54 |
-
new_data = new_data.drop_duplicates(subset=['id'], keep="first")
|
55 |
-
|
56 |
-
# Figure out dates when we restart
|
57 |
-
old_data_most_recent_date = old_data['date'].max()
|
58 |
-
old_data_most_recent_date = datetime.strptime(old_data_most_recent_date, '%Y-%m-%d').date()
|
59 |
-
most_recent_date = max(old_data_most_recent_date, most_recent_date)
|
60 |
-
|
61 |
-
if len(old_data) == len(new_data):
|
62 |
-
logger.warning("Data in hub is much more recent, using that next!")
|
63 |
-
return most_recent_date
|
64 |
-
|
65 |
-
# Convert back to dataset
|
66 |
-
dataset["all_days"] = Dataset.from_pandas(new_data)
|
67 |
-
|
68 |
-
# Update README
|
69 |
-
update_readme(dataset_name, subreddit, date_to_fetch)
|
70 |
-
else:
|
71 |
-
logger.debug("Creating new split 'all_days'")
|
72 |
-
dataset["all_days"] = Dataset.from_pandas(df)
|
73 |
-
# Log appending or creating split 'all'
|
74 |
-
logger.debug("Appended or created split 'all_days'")
|
75 |
-
|
76 |
-
# Push the augmented dataset to the Hugging Face hub
|
77 |
-
logger.debug(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
|
78 |
-
dataset.push_to_hub(dataset_name, token=auth_token)
|
79 |
-
logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
|
80 |
-
return most_recent_date
|
81 |
-
|
82 |
-
|
83 |
-
def run_main_continuously():
|
84 |
-
"""
|
85 |
-
This function runs the given `main_function` continuously, starting from the date specified
|
86 |
-
in the environment variable "START_DATE" until two days ago. Once it reaches two days ago,
|
87 |
-
it will wait until tomorrow to start again at the same time as when it started today.
|
88 |
-
"""
|
89 |
-
start_date_str = os.environ.get("START_DATE")
|
90 |
-
start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
|
91 |
-
|
92 |
-
# Calculate the start time for running the main_function every day.
|
93 |
-
start_time = datetime.now().time()
|
94 |
-
|
95 |
-
dataset = get_dataset()
|
96 |
-
|
97 |
-
while True:
|
98 |
-
today = datetime.now().date()
|
99 |
-
two_days_ago = today - timedelta(days=2)
|
100 |
-
|
101 |
-
if start_date <= two_days_ago:
|
102 |
-
logger.warning(f"Running main function for date: {start_date}")
|
103 |
-
most_recent_date = main(dataset, start_date)
|
104 |
-
start_date = most_recent_date + timedelta(days=1)
|
105 |
-
else:
|
106 |
-
tomorrow = today + timedelta(days=1)
|
107 |
-
now = datetime.now()
|
108 |
-
start_of_tomorrow = datetime.combine(tomorrow, start_time)
|
109 |
-
wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
|
110 |
-
logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
|
111 |
-
time.sleep(wait_until_tomorrow)
|
112 |
-
|
113 |
-
|
114 |
def get_dataset():
|
115 |
# Load the existing dataset from the Hugging Face hub or create a new one
|
116 |
try:
|
@@ -124,5 +37,38 @@ def get_dataset():
|
|
124 |
return dataset
|
125 |
|
126 |
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from datetime import datetime, timedelta
|
4 |
|
5 |
import pandas as pd
|
6 |
+
import schedule
|
7 |
+
from datasets import DatasetDict, load_dataset, Dataset
|
8 |
from huggingface_hub import login
|
9 |
|
10 |
+
from utilities.data_collator import merge_and_filter_data
|
11 |
+
from utilities.my_logger import setup_logger
|
12 |
from utilities.readme_update import update_readme
|
13 |
|
14 |
# Set dataset name, path to README.md, and existing dataset details
|
15 |
subreddit = os.environ["SUBREDDIT"]
|
16 |
username = os.environ["USERNAME"]
|
17 |
+
dataset_name = f"{username}/dataset-creator-reddit-{subreddit}"
|
18 |
dataset_readme_path = "README.md"
|
19 |
|
20 |
# Authenticate with Hugging Face using an auth token
|
|
|
24 |
logger = setup_logger(__name__)
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def get_dataset():
|
28 |
# Load the existing dataset from the Hugging Face hub or create a new one
|
29 |
try:
|
|
|
37 |
return dataset
|
38 |
|
39 |
|
40 |
+
def main():
|
41 |
+
date = datetime.now().strftime('%Y-%m-%d')
|
42 |
+
logger.warning(f"Running main function for date: {date}")
|
43 |
+
dataset = get_dataset()
|
44 |
+
|
45 |
+
# Get Latest Data and merge with historic data
|
46 |
+
old_df = dataset['train'].to_pandas() if 'train' in dataset.keys() else pd.DataFrame()
|
47 |
+
new_df = merge_and_filter_data(old_df=old_df)
|
48 |
+
dataset['train'] = Dataset.from_pandas(new_df, preserve_index=False)
|
49 |
+
|
50 |
+
# Update README
|
51 |
+
update_readme(dataset_name=dataset_name, subreddit=subreddit, latest_date=date)
|
52 |
+
|
53 |
+
# Push the augmented dataset to the Hugging Face hub
|
54 |
+
logger.debug(f"Pushing data for {date} to the Hugging Face hub")
|
55 |
+
dataset.push_to_hub(dataset_name, token=auth_token)
|
56 |
+
logger.info(f"Processed and pushed data for {date} to the Hugging Face Hub")
|
57 |
+
|
58 |
+
|
59 |
+
def schedule_daily_task():
|
60 |
+
"""
|
61 |
+
Run `main` once immediately, then schedule it to run at the same time every day.
|
62 |
+
"""
|
63 |
+
start_time = (datetime.now() + timedelta(seconds=5)).time().strftime('%H:%M') # Now + 5 seconds
|
64 |
+
logger.info(f'Scheduling tasks to run every day at: {start_time}')
|
65 |
+
main()
|
66 |
+
schedule.every().day.at(start_time).do(main)
|
67 |
+
|
68 |
+
while True:
|
69 |
+
schedule.run_pending()
|
70 |
+
time.sleep(1)
|
71 |
+
|
72 |
+
|
73 |
+
if __name__ == "__main__":
|
74 |
+
schedule_daily_task()
|
media/reddit_scraper.drawio.html
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
<!--[if IE]><meta http-equiv="X-UA-Compatible" content="IE=5,IE=9" ><![endif]-->
|
2 |
-
<!DOCTYPE html>
|
3 |
-
<html>
|
4 |
-
<head>
|
5 |
-
<title>reddit_scraper</title>
|
6 |
-
<meta charset="utf-8"/>
|
7 |
-
</head>
|
8 |
-
<body><div class="mxgraph" style="max-width:100%;border:1px solid transparent;" data-mxgraph="{"highlight":"#0000ff","nav":true,"resize":true,"toolbar":"zoom layers tags lightbox","edit":"_blank","xml":"<mxfile host=\"app.diagrams.net\" modified=\"2023-04-14T12:12:14.014Z\" agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36\" etag=\"puEjOIZigDmpONhGThsE\" version=\"21.1.7\" type=\"device\">\n <diagram name=\"Page-1\" id=\"14ddc1Tw5ZQC4xUkB2ri\">\n <mxGraphModel dx=\"1034\" dy=\"783\" grid=\"1\" gridSize=\"10\" guides=\"1\" tooltips=\"1\" connect=\"1\" arrows=\"1\" fold=\"1\" page=\"1\" pageScale=\"1\" pageWidth=\"850\" pageHeight=\"1100\" math=\"0\" shadow=\"0\">\n <root>\n <mxCell id=\"0\" />\n <mxCell id=\"1\" parent=\"0\" />\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-3\" value=\"\" style=\"edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;\" edge=\"1\" parent=\"1\" source=\"KhBTRBst3V2Bs5u7l5Na-1\" target=\"KhBTRBst3V2Bs5u7l5Na-2\">\n <mxGeometry relative=\"1\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-7\" value=\"HF API\" style=\"edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\" vertex=\"1\" connectable=\"0\" parent=\"KhBTRBst3V2Bs5u7l5Na-3\">\n <mxGeometry x=\"-0.125\" y=\"1\" relative=\"1\" as=\"geometry\">\n <mxPoint as=\"offset\" />\n </mxGeometry>\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-1\" value=\"HF SPACE&lt;br&gt;&lt;a href=&quot;SPACE_LINK&quot;&gt;SPACE_NAME&lt;/a&gt;\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\" vertex=\"1\" parent=\"1\">\n <mxGeometry x=\"340\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-2\" value=\"HF DATASET &lt;br&gt;&lt;a href=&quot;DATASET_LINK&quot;&gt;DATASET_NAME&lt;/a&gt;\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;\" vertex=\"1\" 
parent=\"1\">\n <mxGeometry x=\"110\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-4\" value=\"&lt;a href=&quot;pushshift.io&quot;&gt;Pushshift.io&lt;/a&gt;&lt;br&gt;Hosts Reddit Data\" style=\"rounded=1;whiteSpace=wrap;html=1;fillColor=#d5e8d4;strokeColor=#82b366;\" vertex=\"1\" parent=\"1\">\n <mxGeometry x=\"590\" y=\"360\" width=\"160\" height=\"80\" as=\"geometry\" />\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-8\" value=\"\" style=\"endArrow=classic;startArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;\" edge=\"1\" parent=\"1\" source=\"KhBTRBst3V2Bs5u7l5Na-1\" target=\"KhBTRBst3V2Bs5u7l5Na-4\">\n <mxGeometry width=\"50\" height=\"50\" relative=\"1\" as=\"geometry\">\n <mxPoint x=\"470\" y=\"530\" as=\"sourcePoint\" />\n <mxPoint x=\"520\" y=\"480\" as=\"targetPoint\" />\n </mxGeometry>\n </mxCell>\n <mxCell id=\"KhBTRBst3V2Bs5u7l5Na-9\" value=\"HTTP\" style=\"edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];\" vertex=\"1\" connectable=\"0\" parent=\"KhBTRBst3V2Bs5u7l5Na-8\">\n <mxGeometry x=\"0.225\" y=\"1\" relative=\"1\" as=\"geometry\">\n <mxPoint x=\"-9\" y=\"1\" as=\"offset\" />\n </mxGeometry>\n </mxCell>\n </root>\n </mxGraphModel>\n </diagram>\n</mxfile>\n"}"></div>
|
9 |
-
<script type="text/javascript" src="https://viewer.diagrams.net/js/viewer-static.min.js"></script>
|
10 |
-
</body>
|
11 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
media/reddit_scraper.drawio.png
CHANGED
notebooks/data_processing.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/explore.ipynb
DELETED
@@ -1,323 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": null,
|
6 |
-
"id": "730ba509",
|
7 |
-
"metadata": {},
|
8 |
-
"outputs": [],
|
9 |
-
"source": [
|
10 |
-
"from IPython.core.interactiveshell import InteractiveShell\n",
|
11 |
-
"InteractiveShell.ast_node_interactivity = \"all\""
|
12 |
-
]
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"cell_type": "code",
|
16 |
-
"execution_count": null,
|
17 |
-
"id": "d9acd4b6",
|
18 |
-
"metadata": {},
|
19 |
-
"outputs": [],
|
20 |
-
"source": [
|
21 |
-
"from pathlib import Path\n",
|
22 |
-
"import sys\n",
|
23 |
-
"proj_dir = Path.cwd().parent\n",
|
24 |
-
"\n",
|
25 |
-
"sys.path.append(str(proj_dir))\n"
|
26 |
-
]
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"cell_type": "code",
|
30 |
-
"execution_count": null,
|
31 |
-
"id": "62452860",
|
32 |
-
"metadata": {},
|
33 |
-
"outputs": [],
|
34 |
-
"source": [
|
35 |
-
"from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
|
36 |
-
]
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"cell_type": "code",
|
40 |
-
"execution_count": 4,
|
41 |
-
"id": "a956a623",
|
42 |
-
"metadata": {},
|
43 |
-
"outputs": [
|
44 |
-
{
|
45 |
-
"data": {
|
46 |
-
"application/vnd.jupyter.widget-view+json": {
|
47 |
-
"model_id": "17df3f2812084d3591e914ffcfd948b0",
|
48 |
-
"version_major": 2,
|
49 |
-
"version_minor": 0
|
50 |
-
},
|
51 |
-
"text/plain": [
|
52 |
-
"0it [00:00, ?it/s]"
|
53 |
-
]
|
54 |
-
},
|
55 |
-
"metadata": {},
|
56 |
-
"output_type": "display_data"
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"name": "stderr",
|
60 |
-
"output_type": "stream",
|
61 |
-
"text": [
|
62 |
-
"2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
|
63 |
-
"2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
|
64 |
-
"2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
|
65 |
-
"2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
|
66 |
-
"2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
|
67 |
-
"2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
|
68 |
-
"2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
|
69 |
-
]
|
70 |
-
}
|
71 |
-
],
|
72 |
-
"source": [
|
73 |
-
"subreddit_to_scrape = \"askreddit\"\n",
|
74 |
-
"day_to_scrape = \"2013-03-01\"\n",
|
75 |
-
"submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
|
76 |
-
]
|
77 |
-
},
|
78 |
-
{
|
79 |
-
"cell_type": "code",
|
80 |
-
"execution_count": 5,
|
81 |
-
"id": "b1cc845b",
|
82 |
-
"metadata": {},
|
83 |
-
"outputs": [
|
84 |
-
{
|
85 |
-
"data": {
|
86 |
-
"text/html": [
|
87 |
-
"<div>\n",
|
88 |
-
"<style scoped>\n",
|
89 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
90 |
-
" vertical-align: middle;\n",
|
91 |
-
" }\n",
|
92 |
-
"\n",
|
93 |
-
" .dataframe tbody tr th {\n",
|
94 |
-
" vertical-align: top;\n",
|
95 |
-
" }\n",
|
96 |
-
"\n",
|
97 |
-
" .dataframe thead th {\n",
|
98 |
-
" text-align: right;\n",
|
99 |
-
" }\n",
|
100 |
-
"</style>\n",
|
101 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
102 |
-
" <thead>\n",
|
103 |
-
" <tr style=\"text-align: right;\">\n",
|
104 |
-
" <th></th>\n",
|
105 |
-
" <th>permalink</th>\n",
|
106 |
-
" <th>selftext</th>\n",
|
107 |
-
" <th>url</th>\n",
|
108 |
-
" <th>created_utc</th>\n",
|
109 |
-
" <th>author</th>\n",
|
110 |
-
" <th>num_comments</th>\n",
|
111 |
-
" <th>score</th>\n",
|
112 |
-
" <th>title</th>\n",
|
113 |
-
" <th>id</th>\n",
|
114 |
-
" <th>downs</th>\n",
|
115 |
-
" <th>ups</th>\n",
|
116 |
-
" </tr>\n",
|
117 |
-
" </thead>\n",
|
118 |
-
" <tbody>\n",
|
119 |
-
" <tr>\n",
|
120 |
-
" <th>0</th>\n",
|
121 |
-
" <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
|
122 |
-
" <td>Basically, do other parts of the world have th...</td>\n",
|
123 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
124 |
-
" <td>2013-03-01 19:58:55</td>\n",
|
125 |
-
" <td>sjr63</td>\n",
|
126 |
-
" <td>1</td>\n",
|
127 |
-
" <td>1</td>\n",
|
128 |
-
" <td>In the way that popular English and American m...</td>\n",
|
129 |
-
" <td>19hbm0</td>\n",
|
130 |
-
" <td>0</td>\n",
|
131 |
-
" <td>1</td>\n",
|
132 |
-
" </tr>\n",
|
133 |
-
" <tr>\n",
|
134 |
-
" <th>1</th>\n",
|
135 |
-
" <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
|
136 |
-
" <td></td>\n",
|
137 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
138 |
-
" <td>2013-03-01 19:58:50</td>\n",
|
139 |
-
" <td>WeirdPlane</td>\n",
|
140 |
-
" <td>13</td>\n",
|
141 |
-
" <td>1</td>\n",
|
142 |
-
" <td>Could I buy an Android phone without a plan an...</td>\n",
|
143 |
-
" <td>19hblp</td>\n",
|
144 |
-
" <td>0</td>\n",
|
145 |
-
" <td>1</td>\n",
|
146 |
-
" </tr>\n",
|
147 |
-
" <tr>\n",
|
148 |
-
" <th>2</th>\n",
|
149 |
-
" <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
|
150 |
-
" <td>Yeah.\n",
|
151 |
-
"\n",
|
152 |
-
"How do I reddit? I don't use or read re...</td>\n",
|
153 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
154 |
-
" <td>2013-03-01 19:58:47</td>\n",
|
155 |
-
" <td>xxnovaroxgg</td>\n",
|
156 |
-
" <td>14</td>\n",
|
157 |
-
" <td>0</td>\n",
|
158 |
-
" <td>How do I reddit</td>\n",
|
159 |
-
" <td>19hblj</td>\n",
|
160 |
-
" <td>0</td>\n",
|
161 |
-
" <td>0</td>\n",
|
162 |
-
" </tr>\n",
|
163 |
-
" <tr>\n",
|
164 |
-
" <th>3</th>\n",
|
165 |
-
" <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
|
166 |
-
" <td>My brothers, dad and I have always been huge L...</td>\n",
|
167 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
168 |
-
" <td>2013-03-01 19:58:07</td>\n",
|
169 |
-
" <td>tuffstough</td>\n",
|
170 |
-
" <td>0</td>\n",
|
171 |
-
" <td>1</td>\n",
|
172 |
-
" <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
|
173 |
-
" <td>19hbjx</td>\n",
|
174 |
-
" <td>0</td>\n",
|
175 |
-
" <td>1</td>\n",
|
176 |
-
" </tr>\n",
|
177 |
-
" <tr>\n",
|
178 |
-
" <th>4</th>\n",
|
179 |
-
" <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
|
180 |
-
" <td>I'm curious, guys tend to get asked the usual ...</td>\n",
|
181 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
|
182 |
-
" <td>2013-03-01 19:57:58</td>\n",
|
183 |
-
" <td>redditredditx3</td>\n",
|
184 |
-
" <td>13</td>\n",
|
185 |
-
" <td>2</td>\n",
|
186 |
-
" <td>Female Redditors, which part of the male physi...</td>\n",
|
187 |
-
" <td>19hbjk</td>\n",
|
188 |
-
" <td>0</td>\n",
|
189 |
-
" <td>2</td>\n",
|
190 |
-
" </tr>\n",
|
191 |
-
" </tbody>\n",
|
192 |
-
"</table>\n",
|
193 |
-
"</div>"
|
194 |
-
],
|
195 |
-
"text/plain": [
|
196 |
-
" permalink \\\n",
|
197 |
-
"0 /r/AskReddit/comments/19hbm0/in_the_way_that_p... \n",
|
198 |
-
"1 /r/AskReddit/comments/19hblp/could_i_buy_an_an... \n",
|
199 |
-
"2 /r/AskReddit/comments/19hblj/how_do_i_reddit/ \n",
|
200 |
-
"3 /r/AskReddit/comments/19hbjx/xpost_rsurvival_h... \n",
|
201 |
-
"4 /r/AskReddit/comments/19hbjk/female_redditors_... \n",
|
202 |
-
"\n",
|
203 |
-
" selftext \\\n",
|
204 |
-
"0 Basically, do other parts of the world have th... \n",
|
205 |
-
"1 \n",
|
206 |
-
"2 Yeah.\n",
|
207 |
-
"\n",
|
208 |
-
"How do I reddit? I don't use or read re... \n",
|
209 |
-
"3 My brothers, dad and I have always been huge L... \n",
|
210 |
-
"4 I'm curious, guys tend to get asked the usual ... \n",
|
211 |
-
"\n",
|
212 |
-
" url created_utc \\\n",
|
213 |
-
"0 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:55 \n",
|
214 |
-
"1 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:50 \n",
|
215 |
-
"2 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:47 \n",
|
216 |
-
"3 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:58:07 \n",
|
217 |
-
"4 http://www.reddit.com/r/AskReddit/comments/19h... 2013-03-01 19:57:58 \n",
|
218 |
-
"\n",
|
219 |
-
" author num_comments score \\\n",
|
220 |
-
"0 sjr63 1 1 \n",
|
221 |
-
"1 WeirdPlane 13 1 \n",
|
222 |
-
"2 xxnovaroxgg 14 0 \n",
|
223 |
-
"3 tuffstough 0 1 \n",
|
224 |
-
"4 redditredditx3 13 2 \n",
|
225 |
-
"\n",
|
226 |
-
" title id downs ups \n",
|
227 |
-
"0 In the way that popular English and American m... 19hbm0 0 1 \n",
|
228 |
-
"1 Could I buy an Android phone without a plan an... 19hblp 0 1 \n",
|
229 |
-
"2 How do I reddit 19hblj 0 0 \n",
|
230 |
-
"3 (x-post r/survival) Have any redditors seen Le... 19hbjx 0 1 \n",
|
231 |
-
"4 Female Redditors, which part of the male physi... 19hbjk 0 2 "
|
232 |
-
]
|
233 |
-
},
|
234 |
-
"execution_count": 5,
|
235 |
-
"metadata": {},
|
236 |
-
"output_type": "execute_result"
|
237 |
-
}
|
238 |
-
],
|
239 |
-
"source": [
|
240 |
-
"df = submissions_to_dataframe(submissions)\n",
|
241 |
-
"df.head()"
|
242 |
-
]
|
243 |
-
},
|
244 |
-
{
|
245 |
-
"cell_type": "code",
|
246 |
-
"execution_count": null,
|
247 |
-
"id": "518addff",
|
248 |
-
"metadata": {},
|
249 |
-
"outputs": [],
|
250 |
-
"source": []
|
251 |
-
},
|
252 |
-
{
|
253 |
-
"cell_type": "code",
|
254 |
-
"execution_count": null,
|
255 |
-
"id": "6e5490dc",
|
256 |
-
"metadata": {},
|
257 |
-
"outputs": [],
|
258 |
-
"source": [
|
259 |
-
"start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
|
260 |
-
"start_date"
|
261 |
-
]
|
262 |
-
},
|
263 |
-
{
|
264 |
-
"cell_type": "code",
|
265 |
-
"execution_count": null,
|
266 |
-
"id": "bf13555a",
|
267 |
-
"metadata": {},
|
268 |
-
"outputs": [],
|
269 |
-
"source": [
|
270 |
-
"df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
|
271 |
-
]
|
272 |
-
},
|
273 |
-
{
|
274 |
-
"cell_type": "code",
|
275 |
-
"execution_count": null,
|
276 |
-
"id": "48e413f3",
|
277 |
-
"metadata": {},
|
278 |
-
"outputs": [],
|
279 |
-
"source": [
|
280 |
-
"df.head()"
|
281 |
-
]
|
282 |
-
},
|
283 |
-
{
|
284 |
-
"cell_type": "code",
|
285 |
-
"execution_count": null,
|
286 |
-
"id": "9e83befa",
|
287 |
-
"metadata": {},
|
288 |
-
"outputs": [],
|
289 |
-
"source": [
|
290 |
-
"df.dtypes"
|
291 |
-
]
|
292 |
-
},
|
293 |
-
{
|
294 |
-
"cell_type": "code",
|
295 |
-
"execution_count": null,
|
296 |
-
"id": "ba84be68",
|
297 |
-
"metadata": {},
|
298 |
-
"outputs": [],
|
299 |
-
"source": []
|
300 |
-
}
|
301 |
-
],
|
302 |
-
"metadata": {
|
303 |
-
"kernelspec": {
|
304 |
-
"display_name": "Python 3 (ipykernel)",
|
305 |
-
"language": "python",
|
306 |
-
"name": "python3"
|
307 |
-
},
|
308 |
-
"language_info": {
|
309 |
-
"codemirror_mode": {
|
310 |
-
"name": "ipython",
|
311 |
-
"version": 3
|
312 |
-
},
|
313 |
-
"file_extension": ".py",
|
314 |
-
"mimetype": "text/x-python",
|
315 |
-
"name": "python",
|
316 |
-
"nbconvert_exporter": "python",
|
317 |
-
"pygments_lexer": "ipython3",
|
318 |
-
"version": "3.9.16"
|
319 |
-
}
|
320 |
-
},
|
321 |
-
"nbformat": 4,
|
322 |
-
"nbformat_minor": 5
|
323 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/validate.ipynb
DELETED
@@ -1,617 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 1,
|
6 |
-
"id": "730ba509",
|
7 |
-
"metadata": {},
|
8 |
-
"outputs": [],
|
9 |
-
"source": [
|
10 |
-
"from IPython.core.interactiveshell import InteractiveShell\n",
|
11 |
-
"InteractiveShell.ast_node_interactivity = \"all\""
|
12 |
-
]
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"cell_type": "code",
|
16 |
-
"execution_count": 2,
|
17 |
-
"id": "d9acd4b6",
|
18 |
-
"metadata": {},
|
19 |
-
"outputs": [],
|
20 |
-
"source": [
|
21 |
-
"from pathlib import Path\n",
|
22 |
-
"import sys\n",
|
23 |
-
"proj_dir = Path.cwd().parent\n",
|
24 |
-
"\n",
|
25 |
-
"sys.path.append(str(proj_dir))\n"
|
26 |
-
]
|
27 |
-
},
|
28 |
-
{
|
29 |
-
"cell_type": "code",
|
30 |
-
"execution_count": 4,
|
31 |
-
"id": "62452860",
|
32 |
-
"metadata": {},
|
33 |
-
"outputs": [],
|
34 |
-
"source": [
|
35 |
-
"from datasets import load_dataset"
|
36 |
-
]
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"cell_type": "code",
|
40 |
-
"execution_count": 28,
|
41 |
-
"id": "00affc9a",
|
42 |
-
"metadata": {},
|
43 |
-
"outputs": [
|
44 |
-
{
|
45 |
-
"data": {
|
46 |
-
"application/vnd.jupyter.widget-view+json": {
|
47 |
-
"model_id": "a106bb47c1194b15bc289d2ef24258af",
|
48 |
-
"version_major": 2,
|
49 |
-
"version_minor": 0
|
50 |
-
},
|
51 |
-
"text/plain": [
|
52 |
-
"Downloading readme: 0%| | 0.00/804 [00:00<?, ?B/s]"
|
53 |
-
]
|
54 |
-
},
|
55 |
-
"metadata": {},
|
56 |
-
"output_type": "display_data"
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"name": "stderr",
|
60 |
-
"output_type": "stream",
|
61 |
-
"text": [
|
62 |
-
"Using custom data configuration derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16\n"
|
63 |
-
]
|
64 |
-
},
|
65 |
-
{
|
66 |
-
"name": "stdout",
|
67 |
-
"output_type": "stream",
|
68 |
-
"text": [
|
69 |
-
"Downloading and preparing dataset None/None to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
{
|
73 |
-
"data": {
|
74 |
-
"application/vnd.jupyter.widget-view+json": {
|
75 |
-
"model_id": "705d55e70bf442f98a51dd0618a5c2c6",
|
76 |
-
"version_major": 2,
|
77 |
-
"version_minor": 0
|
78 |
-
},
|
79 |
-
"text/plain": [
|
80 |
-
"Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
81 |
-
]
|
82 |
-
},
|
83 |
-
"metadata": {},
|
84 |
-
"output_type": "display_data"
|
85 |
-
},
|
86 |
-
{
|
87 |
-
"data": {
|
88 |
-
"application/vnd.jupyter.widget-view+json": {
|
89 |
-
"model_id": "139220a81674444997f7657a4c2e1a01",
|
90 |
-
"version_major": 2,
|
91 |
-
"version_minor": 0
|
92 |
-
},
|
93 |
-
"text/plain": [
|
94 |
-
"Downloading data: 0%| | 0.00/702k [00:00<?, ?B/s]"
|
95 |
-
]
|
96 |
-
},
|
97 |
-
"metadata": {},
|
98 |
-
"output_type": "display_data"
|
99 |
-
},
|
100 |
-
{
|
101 |
-
"data": {
|
102 |
-
"application/vnd.jupyter.widget-view+json": {
|
103 |
-
"model_id": "1a361406937144cebd4ff6168e56ec3d",
|
104 |
-
"version_major": 2,
|
105 |
-
"version_minor": 0
|
106 |
-
},
|
107 |
-
"text/plain": [
|
108 |
-
"Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
|
109 |
-
]
|
110 |
-
},
|
111 |
-
"metadata": {},
|
112 |
-
"output_type": "display_data"
|
113 |
-
},
|
114 |
-
{
|
115 |
-
"data": {
|
116 |
-
"application/vnd.jupyter.widget-view+json": {
|
117 |
-
"model_id": "",
|
118 |
-
"version_major": 2,
|
119 |
-
"version_minor": 0
|
120 |
-
},
|
121 |
-
"text/plain": [
|
122 |
-
"Generating all_days split: 0%| | 0/3272 [00:00<?, ? examples/s]"
|
123 |
-
]
|
124 |
-
},
|
125 |
-
"metadata": {},
|
126 |
-
"output_type": "display_data"
|
127 |
-
},
|
128 |
-
{
|
129 |
-
"name": "stdout",
|
130 |
-
"output_type": "stream",
|
131 |
-
"text": [
|
132 |
-
"Dataset parquet downloaded and prepared to /Users/derekthomas/.cache/huggingface/datasets/derek-thomas___parquet/derek-thomas--dataset-creator-askreddit-a3c1289ebaf83d16/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
|
133 |
-
]
|
134 |
-
},
|
135 |
-
{
|
136 |
-
"data": {
|
137 |
-
"application/vnd.jupyter.widget-view+json": {
|
138 |
-
"model_id": "4df7107473904386aebd66c543858abd",
|
139 |
-
"version_major": 2,
|
140 |
-
"version_minor": 0
|
141 |
-
},
|
142 |
-
"text/plain": [
|
143 |
-
" 0%| | 0/1 [00:00<?, ?it/s]"
|
144 |
-
]
|
145 |
-
},
|
146 |
-
"metadata": {},
|
147 |
-
"output_type": "display_data"
|
148 |
-
}
|
149 |
-
],
|
150 |
-
"source": [
|
151 |
-
"dataset = load_dataset('derek-thomas/dataset-creator-askreddit', download_mode=\"reuse_cache_if_exists\", ignore_verifications=True)"
|
152 |
-
]
|
153 |
-
},
|
154 |
-
{
|
155 |
-
"cell_type": "code",
|
156 |
-
"execution_count": 29,
|
157 |
-
"id": "ba84be68",
|
158 |
-
"metadata": {},
|
159 |
-
"outputs": [
|
160 |
-
{
|
161 |
-
"data": {
|
162 |
-
"text/html": [
|
163 |
-
"<div>\n",
|
164 |
-
"<style scoped>\n",
|
165 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
166 |
-
" vertical-align: middle;\n",
|
167 |
-
" }\n",
|
168 |
-
"\n",
|
169 |
-
" .dataframe tbody tr th {\n",
|
170 |
-
" vertical-align: top;\n",
|
171 |
-
" }\n",
|
172 |
-
"\n",
|
173 |
-
" .dataframe thead th {\n",
|
174 |
-
" text-align: right;\n",
|
175 |
-
" }\n",
|
176 |
-
"</style>\n",
|
177 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
178 |
-
" <thead>\n",
|
179 |
-
" <tr style=\"text-align: right;\">\n",
|
180 |
-
" <th></th>\n",
|
181 |
-
" <th>score</th>\n",
|
182 |
-
" <th>num_comments</th>\n",
|
183 |
-
" <th>title</th>\n",
|
184 |
-
" <th>permalink</th>\n",
|
185 |
-
" <th>selftext</th>\n",
|
186 |
-
" <th>url</th>\n",
|
187 |
-
" <th>created_utc</th>\n",
|
188 |
-
" <th>author</th>\n",
|
189 |
-
" <th>id</th>\n",
|
190 |
-
" <th>downs</th>\n",
|
191 |
-
" <th>ups</th>\n",
|
192 |
-
" <th>date</th>\n",
|
193 |
-
" <th>time</th>\n",
|
194 |
-
" </tr>\n",
|
195 |
-
" </thead>\n",
|
196 |
-
" <tbody>\n",
|
197 |
-
" <tr>\n",
|
198 |
-
" <th>0</th>\n",
|
199 |
-
" <td>2</td>\n",
|
200 |
-
" <td>4</td>\n",
|
201 |
-
" <td>Reddit, if someone had to describe you to a st...</td>\n",
|
202 |
-
" <td>/r/AskReddit/comments/15sn6y/reddit_if_someone...</td>\n",
|
203 |
-
" <td>They would be talking about you without your p...</td>\n",
|
204 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
205 |
-
" <td>2013-01-01 23:59:40+00:00</td>\n",
|
206 |
-
" <td>[deleted]</td>\n",
|
207 |
-
" <td>15sn6y</td>\n",
|
208 |
-
" <td>0</td>\n",
|
209 |
-
" <td>2</td>\n",
|
210 |
-
" <td>2013-01-01</td>\n",
|
211 |
-
" <td>23:59:40</td>\n",
|
212 |
-
" </tr>\n",
|
213 |
-
" <tr>\n",
|
214 |
-
" <th>1</th>\n",
|
215 |
-
" <td>5</td>\n",
|
216 |
-
" <td>24</td>\n",
|
217 |
-
" <td>What kind of car does the average \\nRedditor d...</td>\n",
|
218 |
-
" <td>/r/AskReddit/comments/15sn6m/what_kind_of_car_...</td>\n",
|
219 |
-
" <td>I've always wanted to know what kind of car th...</td>\n",
|
220 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
221 |
-
" <td>2013-01-01 23:59:31+00:00</td>\n",
|
222 |
-
" <td>PaytonAdams</td>\n",
|
223 |
-
" <td>15sn6m</td>\n",
|
224 |
-
" <td>0</td>\n",
|
225 |
-
" <td>5</td>\n",
|
226 |
-
" <td>2013-01-01</td>\n",
|
227 |
-
" <td>23:59:31</td>\n",
|
228 |
-
" </tr>\n",
|
229 |
-
" <tr>\n",
|
230 |
-
" <th>2</th>\n",
|
231 |
-
" <td>1</td>\n",
|
232 |
-
" <td>5</td>\n",
|
233 |
-
" <td>What movies have made you go back to the theat...</td>\n",
|
234 |
-
" <td>/r/AskReddit/comments/15sn6b/what_movies_have_...</td>\n",
|
235 |
-
" <td></td>\n",
|
236 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
237 |
-
" <td>2013-01-01 23:59:20+00:00</td>\n",
|
238 |
-
" <td>[deleted]</td>\n",
|
239 |
-
" <td>15sn6b</td>\n",
|
240 |
-
" <td>0</td>\n",
|
241 |
-
" <td>1</td>\n",
|
242 |
-
" <td>2013-01-01</td>\n",
|
243 |
-
" <td>23:59:20</td>\n",
|
244 |
-
" </tr>\n",
|
245 |
-
" <tr>\n",
|
246 |
-
" <th>3</th>\n",
|
247 |
-
" <td>0</td>\n",
|
248 |
-
" <td>18</td>\n",
|
249 |
-
" <td>Worst fear(s)?</td>\n",
|
250 |
-
" <td>/r/AskReddit/comments/15sn4u/worst_fears/</td>\n",
|
251 |
-
" <td>So what is your worst fear, reddit?</td>\n",
|
252 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
253 |
-
" <td>2013-01-01 23:58:37+00:00</td>\n",
|
254 |
-
" <td>[deleted]</td>\n",
|
255 |
-
" <td>15sn4u</td>\n",
|
256 |
-
" <td>0</td>\n",
|
257 |
-
" <td>0</td>\n",
|
258 |
-
" <td>2013-01-01</td>\n",
|
259 |
-
" <td>23:58:37</td>\n",
|
260 |
-
" </tr>\n",
|
261 |
-
" <tr>\n",
|
262 |
-
" <th>4</th>\n",
|
263 |
-
" <td>11</td>\n",
|
264 |
-
" <td>29</td>\n",
|
265 |
-
" <td>If there was a type of ink that lasted only fo...</td>\n",
|
266 |
-
" <td>/r/AskReddit/comments/15sn44/if_there_was_a_ty...</td>\n",
|
267 |
-
" <td></td>\n",
|
268 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15s...</td>\n",
|
269 |
-
" <td>2013-01-01 23:58:15+00:00</td>\n",
|
270 |
-
" <td>Honeybeard</td>\n",
|
271 |
-
" <td>15sn44</td>\n",
|
272 |
-
" <td>0</td>\n",
|
273 |
-
" <td>11</td>\n",
|
274 |
-
" <td>2013-01-01</td>\n",
|
275 |
-
" <td>23:58:15</td>\n",
|
276 |
-
" </tr>\n",
|
277 |
-
" <tr>\n",
|
278 |
-
" <th>...</th>\n",
|
279 |
-
" <td>...</td>\n",
|
280 |
-
" <td>...</td>\n",
|
281 |
-
" <td>...</td>\n",
|
282 |
-
" <td>...</td>\n",
|
283 |
-
" <td>...</td>\n",
|
284 |
-
" <td>...</td>\n",
|
285 |
-
" <td>...</td>\n",
|
286 |
-
" <td>...</td>\n",
|
287 |
-
" <td>...</td>\n",
|
288 |
-
" <td>...</td>\n",
|
289 |
-
" <td>...</td>\n",
|
290 |
-
" <td>...</td>\n",
|
291 |
-
" <td>...</td>\n",
|
292 |
-
" </tr>\n",
|
293 |
-
" <tr>\n",
|
294 |
-
" <th>3267</th>\n",
|
295 |
-
" <td>0</td>\n",
|
296 |
-
" <td>11</td>\n",
|
297 |
-
" <td>Smokers of Reddit- What are your reasons for s...</td>\n",
|
298 |
-
" <td>/r/AskReddit/comments/15qzen/smokers_of_reddit...</td>\n",
|
299 |
-
" <td>I'm very curious as to what causes someone to ...</td>\n",
|
300 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
301 |
-
" <td>2013-01-01 00:01:36+00:00</td>\n",
|
302 |
-
" <td>kelsofb</td>\n",
|
303 |
-
" <td>15qzen</td>\n",
|
304 |
-
" <td>0</td>\n",
|
305 |
-
" <td>0</td>\n",
|
306 |
-
" <td>2013-01-01</td>\n",
|
307 |
-
" <td>00:01:36</td>\n",
|
308 |
-
" </tr>\n",
|
309 |
-
" <tr>\n",
|
310 |
-
" <th>3268</th>\n",
|
311 |
-
" <td>1</td>\n",
|
312 |
-
" <td>4</td>\n",
|
313 |
-
" <td>Hi</td>\n",
|
314 |
-
" <td>/r/AskReddit/comments/15qzei/hi/</td>\n",
|
315 |
-
" <td></td>\n",
|
316 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
317 |
-
" <td>2013-01-01 00:01:34+00:00</td>\n",
|
318 |
-
" <td>ImJE5US</td>\n",
|
319 |
-
" <td>15qzei</td>\n",
|
320 |
-
" <td>0</td>\n",
|
321 |
-
" <td>1</td>\n",
|
322 |
-
" <td>2013-01-01</td>\n",
|
323 |
-
" <td>00:01:34</td>\n",
|
324 |
-
" </tr>\n",
|
325 |
-
" <tr>\n",
|
326 |
-
" <th>3269</th>\n",
|
327 |
-
" <td>1</td>\n",
|
328 |
-
" <td>2</td>\n",
|
329 |
-
" <td>At the stroke of midnight I was writing this p...</td>\n",
|
330 |
-
" <td>/r/AskReddit/comments/15qzdx/at_the_stroke_of_...</td>\n",
|
331 |
-
" <td></td>\n",
|
332 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
333 |
-
" <td>2013-01-01 00:01:15+00:00</td>\n",
|
334 |
-
" <td>Sangfroid_Sonder</td>\n",
|
335 |
-
" <td>15qzdx</td>\n",
|
336 |
-
" <td>0</td>\n",
|
337 |
-
" <td>1</td>\n",
|
338 |
-
" <td>2013-01-01</td>\n",
|
339 |
-
" <td>00:01:15</td>\n",
|
340 |
-
" </tr>\n",
|
341 |
-
" <tr>\n",
|
342 |
-
" <th>3270</th>\n",
|
343 |
-
" <td>1</td>\n",
|
344 |
-
" <td>2</td>\n",
|
345 |
-
" <td>With all the rape stories in the news, why don...</td>\n",
|
346 |
-
" <td>/r/AskReddit/comments/15qzdc/with_all_the_rape...</td>\n",
|
347 |
-
" <td></td>\n",
|
348 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
349 |
-
" <td>2013-01-01 00:00:58+00:00</td>\n",
|
350 |
-
" <td>[deleted]</td>\n",
|
351 |
-
" <td>15qzdc</td>\n",
|
352 |
-
" <td>0</td>\n",
|
353 |
-
" <td>1</td>\n",
|
354 |
-
" <td>2013-01-01</td>\n",
|
355 |
-
" <td>00:00:58</td>\n",
|
356 |
-
" </tr>\n",
|
357 |
-
" <tr>\n",
|
358 |
-
" <th>3271</th>\n",
|
359 |
-
" <td>0</td>\n",
|
360 |
-
" <td>3</td>\n",
|
361 |
-
" <td>Do beautiful people have low entropy?</td>\n",
|
362 |
-
" <td>/r/AskReddit/comments/15qzd3/do_beautiful_peop...</td>\n",
|
363 |
-
" <td>I have been reading about entropy and arrows o...</td>\n",
|
364 |
-
" <td>http://www.reddit.com/r/AskReddit/comments/15q...</td>\n",
|
365 |
-
" <td>2013-01-01 00:00:53+00:00</td>\n",
|
366 |
-
" <td>[deleted]</td>\n",
|
367 |
-
" <td>15qzd3</td>\n",
|
368 |
-
" <td>0</td>\n",
|
369 |
-
" <td>0</td>\n",
|
370 |
-
" <td>2013-01-01</td>\n",
|
371 |
-
" <td>00:00:53</td>\n",
|
372 |
-
" </tr>\n",
|
373 |
-
" </tbody>\n",
|
374 |
-
"</table>\n",
|
375 |
-
"<p>3272 rows × 13 columns</p>\n",
|
376 |
-
"</div>"
|
377 |
-
],
|
378 |
-
"text/plain": [
|
379 |
-
" score num_comments title \\\n",
|
380 |
-
"0 2 4 Reddit, if someone had to describe you to a st... \n",
|
381 |
-
"1 5 24 What kind of car does the average \\nRedditor d... \n",
|
382 |
-
"2 1 5 What movies have made you go back to the theat... \n",
|
383 |
-
"3 0 18 Worst fear(s)? \n",
|
384 |
-
"4 11 29 If there was a type of ink that lasted only fo... \n",
|
385 |
-
"... ... ... ... \n",
|
386 |
-
"3267 0 11 Smokers of Reddit- What are your reasons for s... \n",
|
387 |
-
"3268 1 4 Hi \n",
|
388 |
-
"3269 1 2 At the stroke of midnight I was writing this p... \n",
|
389 |
-
"3270 1 2 With all the rape stories in the news, why don... \n",
|
390 |
-
"3271 0 3 Do beautiful people have low entropy? \n",
|
391 |
-
"\n",
|
392 |
-
" permalink \\\n",
|
393 |
-
"0 /r/AskReddit/comments/15sn6y/reddit_if_someone... \n",
|
394 |
-
"1 /r/AskReddit/comments/15sn6m/what_kind_of_car_... \n",
|
395 |
-
"2 /r/AskReddit/comments/15sn6b/what_movies_have_... \n",
|
396 |
-
"3 /r/AskReddit/comments/15sn4u/worst_fears/ \n",
|
397 |
-
"4 /r/AskReddit/comments/15sn44/if_there_was_a_ty... \n",
|
398 |
-
"... ... \n",
|
399 |
-
"3267 /r/AskReddit/comments/15qzen/smokers_of_reddit... \n",
|
400 |
-
"3268 /r/AskReddit/comments/15qzei/hi/ \n",
|
401 |
-
"3269 /r/AskReddit/comments/15qzdx/at_the_stroke_of_... \n",
|
402 |
-
"3270 /r/AskReddit/comments/15qzdc/with_all_the_rape... \n",
|
403 |
-
"3271 /r/AskReddit/comments/15qzd3/do_beautiful_peop... \n",
|
404 |
-
"\n",
|
405 |
-
" selftext \\\n",
|
406 |
-
"0 They would be talking about you without your p... \n",
|
407 |
-
"1 I've always wanted to know what kind of car th... \n",
|
408 |
-
"2 \n",
|
409 |
-
"3 So what is your worst fear, reddit? \n",
|
410 |
-
"4 \n",
|
411 |
-
"... ... \n",
|
412 |
-
"3267 I'm very curious as to what causes someone to ... \n",
|
413 |
-
"3268 \n",
|
414 |
-
"3269 \n",
|
415 |
-
"3270 \n",
|
416 |
-
"3271 I have been reading about entropy and arrows o... \n",
|
417 |
-
"\n",
|
418 |
-
" url \\\n",
|
419 |
-
"0 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
420 |
-
"1 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
421 |
-
"2 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
422 |
-
"3 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
423 |
-
"4 http://www.reddit.com/r/AskReddit/comments/15s... \n",
|
424 |
-
"... ... \n",
|
425 |
-
"3267 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
426 |
-
"3268 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
427 |
-
"3269 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
428 |
-
"3270 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
429 |
-
"3271 http://www.reddit.com/r/AskReddit/comments/15q... \n",
|
430 |
-
"\n",
|
431 |
-
" created_utc author id downs ups \\\n",
|
432 |
-
"0 2013-01-01 23:59:40+00:00 [deleted] 15sn6y 0 2 \n",
|
433 |
-
"1 2013-01-01 23:59:31+00:00 PaytonAdams 15sn6m 0 5 \n",
|
434 |
-
"2 2013-01-01 23:59:20+00:00 [deleted] 15sn6b 0 1 \n",
|
435 |
-
"3 2013-01-01 23:58:37+00:00 [deleted] 15sn4u 0 0 \n",
|
436 |
-
"4 2013-01-01 23:58:15+00:00 Honeybeard 15sn44 0 11 \n",
|
437 |
-
"... ... ... ... ... ... \n",
|
438 |
-
"3267 2013-01-01 00:01:36+00:00 kelsofb 15qzen 0 0 \n",
|
439 |
-
"3268 2013-01-01 00:01:34+00:00 ImJE5US 15qzei 0 1 \n",
|
440 |
-
"3269 2013-01-01 00:01:15+00:00 Sangfroid_Sonder 15qzdx 0 1 \n",
|
441 |
-
"3270 2013-01-01 00:00:58+00:00 [deleted] 15qzdc 0 1 \n",
|
442 |
-
"3271 2013-01-01 00:00:53+00:00 [deleted] 15qzd3 0 0 \n",
|
443 |
-
"\n",
|
444 |
-
" date time \n",
|
445 |
-
"0 2013-01-01 23:59:40 \n",
|
446 |
-
"1 2013-01-01 23:59:31 \n",
|
447 |
-
"2 2013-01-01 23:59:20 \n",
|
448 |
-
"3 2013-01-01 23:58:37 \n",
|
449 |
-
"4 2013-01-01 23:58:15 \n",
|
450 |
-
"... ... ... \n",
|
451 |
-
"3267 2013-01-01 00:01:36 \n",
|
452 |
-
"3268 2013-01-01 00:01:34 \n",
|
453 |
-
"3269 2013-01-01 00:01:15 \n",
|
454 |
-
"3270 2013-01-01 00:00:58 \n",
|
455 |
-
"3271 2013-01-01 00:00:53 \n",
|
456 |
-
"\n",
|
457 |
-
"[3272 rows x 13 columns]"
|
458 |
-
]
|
459 |
-
},
|
460 |
-
"execution_count": 29,
|
461 |
-
"metadata": {},
|
462 |
-
"output_type": "execute_result"
|
463 |
-
}
|
464 |
-
],
|
465 |
-
"source": [
|
466 |
-
"df = dataset['all_days'].to_pandas()\n",
|
467 |
-
"df"
|
468 |
-
]
|
469 |
-
},
|
470 |
-
{
|
471 |
-
"cell_type": "code",
|
472 |
-
"execution_count": 16,
|
473 |
-
"id": "28df4b06",
|
474 |
-
"metadata": {},
|
475 |
-
"outputs": [
|
476 |
-
{
|
477 |
-
"data": {
|
478 |
-
"text/plain": [
|
479 |
-
"score Int64\n",
|
480 |
-
"num_comments Int64\n",
|
481 |
-
"title string\n",
|
482 |
-
"permalink string\n",
|
483 |
-
"selftext string\n",
|
484 |
-
"url string\n",
|
485 |
-
"created_utc string\n",
|
486 |
-
"author string\n",
|
487 |
-
"id string\n",
|
488 |
-
"downs Int64\n",
|
489 |
-
"ups Int64\n",
|
490 |
-
"dtype: object"
|
491 |
-
]
|
492 |
-
},
|
493 |
-
"execution_count": 16,
|
494 |
-
"metadata": {},
|
495 |
-
"output_type": "execute_result"
|
496 |
-
}
|
497 |
-
],
|
498 |
-
"source": [
|
499 |
-
"df.convert_dtypes().dtypes"
|
500 |
-
]
|
501 |
-
},
|
502 |
-
{
|
503 |
-
"cell_type": "code",
|
504 |
-
"execution_count": 18,
|
505 |
-
"id": "e322b6c0",
|
506 |
-
"metadata": {},
|
507 |
-
"outputs": [],
|
508 |
-
"source": [
|
509 |
-
"import pandas as pd"
|
510 |
-
]
|
511 |
-
},
|
512 |
-
{
|
513 |
-
"cell_type": "code",
|
514 |
-
"execution_count": 21,
|
515 |
-
"id": "ed1b06c3",
|
516 |
-
"metadata": {},
|
517 |
-
"outputs": [],
|
518 |
-
"source": [
|
519 |
-
"df['created_utc'] = pd.to_datetime(df['created_utc'])\n",
|
520 |
-
"df['date'] = df['created_utc'].dt.date\n",
|
521 |
-
"df['time'] = df['created_utc'].dt.time"
|
522 |
-
]
|
523 |
-
},
|
524 |
-
{
|
525 |
-
"cell_type": "code",
|
526 |
-
"execution_count": 33,
|
527 |
-
"id": "ff477737",
|
528 |
-
"metadata": {},
|
529 |
-
"outputs": [
|
530 |
-
{
|
531 |
-
"data": {
|
532 |
-
"text/plain": [
|
533 |
-
"2013-01-01 3272\n",
|
534 |
-
"Name: date, dtype: int64"
|
535 |
-
]
|
536 |
-
},
|
537 |
-
"execution_count": 33,
|
538 |
-
"metadata": {},
|
539 |
-
"output_type": "execute_result"
|
540 |
-
}
|
541 |
-
],
|
542 |
-
"source": [
|
543 |
-
"df.date.value_counts()"
|
544 |
-
]
|
545 |
-
},
|
546 |
-
{
|
547 |
-
"cell_type": "code",
|
548 |
-
"execution_count": 26,
|
549 |
-
"id": "1d11b967",
|
550 |
-
"metadata": {},
|
551 |
-
"outputs": [],
|
552 |
-
"source": [
|
553 |
-
"new_df = df.drop_duplicates(subset=['id'], keep=\"first\")"
|
554 |
-
]
|
555 |
-
},
|
556 |
-
{
|
557 |
-
"cell_type": "code",
|
558 |
-
"execution_count": 27,
|
559 |
-
"id": "eec00dd6",
|
560 |
-
"metadata": {},
|
561 |
-
"outputs": [
|
562 |
-
{
|
563 |
-
"data": {
|
564 |
-
"text/plain": [
|
565 |
-
"<Axes: >"
|
566 |
-
]
|
567 |
-
},
|
568 |
-
"execution_count": 27,
|
569 |
-
"metadata": {},
|
570 |
-
"output_type": "execute_result"
|
571 |
-
},
|
572 |
-
{
|
573 |
-
"data": {
|
574 |
-
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGdCAYAAAA7VYb2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+sklEQVR4nO3de3hU1b3/8U+CuXFJAiqEaJC0FgG5CgoRpVpiIlALlMJBsFqLUGmwYqwXWosBLxRURAGLHItIJZXSI4jAiaRwNCqRSyRVQan6UPEUE34WSSSUZCDr94dndjO5T7ImMzt5v55nnpC916z92WvW3vNlzyVhxhgjAAAANEt4sAMAAAC0BhRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGDBOcEOEEyVlZU6evSoOnXqpLCwsGDHAQAAjWCM0ddff63ExESFh4fO9aE2XVQdPXpUSUlJwY4BAACa4PPPP9eFF14Y7BiONl1UderUSdI3D0psbKy1fj0ej7Zv3660tDRFRERY6zfQ3Jpbcnd2ifzB5ObsknvzuzW35O7sUuvIv2nTJt12223O83ioaNNFlfclv9jYWOtFVfv27RUbG+uqCevW3JK7s0vkDyY3Z5fcm9+tuSV3Z5daT35JIffWndB5IRIAAMDFKKoAAAAs8LuoysvL0w033KDExESFhYVp06ZNzjqPx6P77rtP/fv3V4cOHZSYmKibb75ZR48e9enj+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15cI8uGDRvUu3dvRUdHq3///tq2bZu/uwMAAGCF30VVWVmZBg4cqBUrVtRYd+rUKb377rv6zW9+o3fffVcvv/yyDh06pB/84Ac+7aZNm6YDBw4oNzdXW7ZsUV5enmbOnOmsLy0tVVpami666CIVFBToscceU1ZWllatWuW02bVrl2688UZNnz5d+/fv1/jx4zV+/Hh98MEH/u4SAABAs/n9RvXRo0dr9OjRta6Li4tTbm6uz7Lly5friiuu0JEjR9SjRw99+OGHysnJ0d69ezV06FBJ0rJlyzRmzBg9/vjjSkxM1Lp161RRUaHVq1crMjJSl156qQoLC7VkyRKn+Hrqqad0/fXX65577pEkPfTQQ8rNzdXy5cu1cuVKf3cLAACgWQL+6b+SkhKFhYUpPj5ekpSfn6/4+HinoJKk1NRUhYeHa/fu3ZowYYLy8/M1cuRIRUZGOm3S09O1aNEiffXVV+rcubPy8/OVmZnps6309HSflyOrKy8vV3l5ufN7aWmppG9etvR4PBb2Vk5/VX+6hVtzS+7OLpE/mNycXXJvfrfmltydXWo9+UNRQIuq06dP67777tONN97ofGVBUVGRunbt6hvinHPUpUsXFRUVOW2Sk5N92nTr1s1Z17lzZxUVFTnLqrbx9lGbhQsXav78+TWWb9++3fl4pk3Vr9q5hVtzS+7OLpE/mNycXXJvfrfmltydXXJ//lAUsKLK4/Fo8uTJMsbod7/7XaA245e5c+f6XN0qLS1VUlKS0tLSrH9PVW5urq677jpXfQeIW3NL7s4ukT+Y3Jxdcm9+t+aW3J1dah35X3nllWDHqFVAiipvQfXZZ59p586dPgVLQkKCjh075tP+zJkzOn78uBISEpw2xcXFPm28vzfUxru+NlFRUYqKiqqxPCIiIiATK1D9Bppbc0vuzi6RP5jcnF1yb3635pbcnV1yf/5QZP17qrwF1ccff6y//OUvOvfcc33Wp6Sk6MSJEyooKHCW7dy5U5WVlRo2bJjTJi8vz+d109zcXF1yySXq3Lmz02bHjh0+fefm5iolJcX2LgEAADTI76Lq5MmTKiwsVGFhoSTp8OHDKiws1JEjR+TxePSjH/1I+/bt07p163T27FkVFRWpqKhIFRUVkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZMmaLExE
RJ0tSpUxUZGanp06frwIEDWr9+vZ566imfl+7uvPNO5eTk6IknntBHH32krKws7du3T7Nnz7YwLAAAAP7xu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0//+Mc/tHnzZv3v//6vBg0apO7duzu3Xbt2OX2sW7dOvXv31qhRozRmzBhdddVVPt9BFRcXp+3bt+vw4cMaMmSI7r77bs2bN8/nu6yuvPJKZWdna9WqVRo4cKD+/Oc/a9OmTerXr19zxgMAAKBJ/H5P1TXXXCNjTJ3r61vn1aVLF2VnZ9fbZsCAAXrzzTfrbTNp0iRNmjSpwe0BAAAEGn/7DwAAwAKKKgAA4Jee928NdoSQRFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAA1KLn/VuDHQEuQ1EFAABgAUUVAACoFVfr/ENRBQAAYAFFFQAAgAUUVQAA+KFf1mvBjoAQRVEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQCgWfiCSOAbFFUAEEQUJEDrQVEFAABgAUUVAACABRRVAACgWXrev5WXskVRBQAAYIXfRVVeXp5uuOEGJSYmKiwsTJs2bfJZb4zRvHnz1L17d8XExCg1NVUff/yxT5vjx49r2rRpio2NVXx8vKZPn66TJ0/6tHnvvfd09dVXKzo6WklJSVq8eHGNLBs2bFDv3r0VHR2t/v37a9u2bf7uDgAAgBV+F1VlZWUaOHCgVqxYUev6xYsX6+mnn9bKlSu1e/dudejQQenp6Tp9+rTTZtq0aTpw4IByc3O1ZcsW5eXlaebMmc760tJSpaWl6aKLLlJBQYEee+wxZWVladWqVU6bXbt26cYbb9T06dO1f/9+jR8/XuPHj9cHH3zg7y4BAAA02zn+3mH06NEaPXp0reuMMVq6dKkeeOABjRs3TpK0du1adevWTZs2bdKUKVP04YcfKicnR3v37tXQoUMlScuWLdOYMWP0+OOPKzExUevWrVNFRYVWr16tyMhIXXrppSosLNSSJUuc4uupp57S9ddfr3vuuUeS9NBDDyk3N1fLly/XypUrmzQYANDaed/38vffjg1yEqD18buoqs/hw4dVVFSk1NRUZ1lcXJyGDRum/Px8TZkyRfn5+YqPj3cKKklKTU1VeHi4du/erQkTJig/P18jR45UZGSk0yY9PV2LFi3SV199pc6dOys/P1+ZmZk+209PT6/xcmRV5eXlKi8vd34vLS2VJHk8Hnk8nubuvsPbl80+W4Jbc0vuzi6RP5iCnT2qnWnWtv3NH9XO+NW+sX3621+wx70x6tqvqHD7Y9iS/Bn7Oseg2vJAzKu6hPK4hxljTJPvHBamjRs3avz48ZK+eUluxIgROnr0qLp37+60mzx5ssLCwrR+/Xo9+uijeuGFF3To0CGfvrp27ar58+dr1qxZSktLU3Jysp599lln/cGDB3XppZfq4MGD6tOnjyIjI/XCCy/oxhtvdNo888wzmj9/voqLi2vNm5WVpfnz59dYnp2drfbt2zd1GAAAQAs6deqUpk6dqpKSEsXGxgY7jsPqlapQN3fuXJ+rW6WlpUpKSlJaWprVB8Xj8Sg3N1fXXXedIiIirPUbaG7NLbk7u0T+YAp29n5Zr+mDrPQm39/f/P2yXpOkZm2ztj797a+p497c8fJHXdsasiBHDw2tdOV8l/wb+7rGoPryQMyrung8Hr3yyisB305TWC2qEhISJEnFxcU+V6qKi4s1aNAgp82xY8d87nfmzBkdP37cuX9CQkKNq03e3x
tq411fm6ioKEVFRdVYHhEREZADI1D9Bppbc0vuzi6RP5iClb38bJiV7daWv+f9W2u8d6r8bJjT3pbm7IO/425rvJqzrfLKf4+hW+e71Lj8dY5BteWBmFduZPV7qpKTk5WQkKAdO3Y4y0pLS7V7926lpKRIklJSUnTixAkVFBQ4bXbu3KnKykoNGzbMaZOXl+fzumlubq4uueQSde7c2WlTdTveNt7tAAAAtCS/i6qTJ0+qsLBQhYWFkr55c3phYaGOHDmisLAwzZkzRw8//LA2b96s999/XzfffLMSExOd91316dNH119/vWbMmKE9e/bo7bff1uzZszVlyhQlJiZKkqZOnarIyEhNnz5dBw4c0Pr16/XUU0/5vHR35513KicnR0888YQ++ugjZWVlad++fZo9e3bzRwUBwzfuAqhNU88NbjinuCGjTW1tf6vyu6jat2+fBg8erMGDB0uSMjMzNXjwYM2bN0+SdO+99+qOO+7QzJkzdfnll+vkyZPKyclRdHS008e6devUu3dvjRo1SmPGjNFVV13l8x1UcXFx2r59uw4fPqwhQ4bo7rvv1rx583y+y+rKK69Udna2Vq1apYEDB+rPf/6zNm3apH79+jV5MACgLs19omjLTzRAW+H3e6quueYa1feBwbCwMC1YsEALFiyos02XLl2UnZ1d73YGDBigN998s942kyZN0qRJk+oPDAAhpLb3OQV7u8HKBLQ2/O0/AECbF4g/CMzVybaHogoAAMACiioAaGW4QhLaAnFVDKGBogohg5MM3Ka1Pzm25n0DAoGiCkHByRpAa9Gazmf+7ktr2ncbKKoAAAAsoKgCAIQkroLAbSiqAAAALKCoAgA0CleO3IXHq+VRVAEAWlQoPNmHQobqassUijlRN4oqAAAACyiqENJa+/cAAQBaD4oqAAhB/GeiZTDOsImiCgAAwAKKKuD/8FIjYB/HlDvwONlBURVCmNQAALgXRRUAVMN/cAA0BUUVAIQQCjrAvSiqAABWURhCapvzgKIKVrTFgwcAgoHzbeiiqELQcYIAALQGFFXwCwUQYAfHEmxiPoUGiiq4DicPAI3F+QItiaIKACziSbx2jEtwMf4tg6IK9eJAdBceLwAIHooqAAAACyiqAABAg7gS3jCKKgCAX/jj40DtKKrQZrnpScFNWQGgraKoQqvVmguRxu4bVxRQH+YGYBdFVRD484SI4ONxAAA0BkUVAAAtgP+gtX4UVQAAABZQVAFNwP84EWr6Zb0W7AghjfcXoiVQVLUCnCgAAAg+iiqgDWsrBXlr28/Wtj9Aa0FRhZDEkwYAwG0oqhAQvH+h9eHxBID6UVQBAABYQFHVSnFVAWgcPjUHwBaKKgCtBv+ZgBdzAcFAUeVCnCxaDx5LNBdX2oDQQVGFNodCpn6MD1qSm4tCN2dHYFBUAS2MT0YCCHWco5qGogoO/tcFwO2CXQzwn6a2jaKqBXGghSYeF/fisUMgMK/QVBRVAOACbeUKSGP3sS2MBdyHogoAgAZQxKExKKqANsLNTwq2srt5DACEPutF1dmzZ/Wb3/xGycnJiomJ0be//W099NBDMsY4bYwxmjdvnrp3766YmBilpqbq448/9unn+PHjmjZtmmJjYxUfH6/p06fr5MmTPm3ee+89XX311YqOjlZSUpIWL15se3cAK3gyBxAKOBcFlvWiatGiRfrd736n5cuX68MPP9SiRYu0ePFiLVu2zGmzePFiPf3001q5cqV2796tDh06KD09XadPn3baTJs2TQcOHFBubq62bNmivLw8zZw501lfWlqqtLQ0XXTRRSooKNBjjz2mrKwsrVq1yvYutUr1HVj9sl7jwAMAwE/Wi6pdu3Zp3LhxGjt2rHr27Kkf/ehHSktL0549eyR9c5Vq6dKleuCBBzRu3DgNGDBAa9eu1dGjR7Vp0yZJ0ocffqicnBw999xzGjZsmK666iotW7ZML730ko4ePSpJWrdunSoqKrR69Wpdeu
mlmjJlin7xi19oyZIltncJaJS2Voi2xv1tzj61hvFoDfsABNM5tju88sortWrVKv3tb39Tr1699Ne//lVvvfWWU+wcPnxYRUVFSk1Nde4TFxenYcOGKT8/X1OmTFF+fr7i4+M1dOhQp01qaqrCw8O1e/duTZgwQfn5+Ro5cqQiIyOdNunp6Vq0aJG++uorde7cuUa28vJylZeXO7+XlpZKkjwejzwej7Ux8PZVvc+odkYej8f52S/rNX2QlV5jfV2/+9OurvvWe59w4/OzatamZPL2UVu7qu0bWla9n9oy1DbmDY1zbT/ry9yUZbWtq7od7/K65kxdfTflcWjuftSXoaH8zdWcY6Hq8vrmfW1zoqEMtY1vYzPU1b6+5dW3VTV/VLj/x1Rjs9SWzdtHQ49DfVmq5m6oXUPj2Jhxri1zY8altsy1ZW9oTgTyXNPQstr2qbHjVusY+Dl+NgXqPGNDmKn6ZicLKisr9atf/UqLFy9Wu3btdPbsWT3yyCOaO3eupG+uZI0YMUJHjx5V9+7dnftNnjxZYWFhWr9+vR599FG98MILOnTokE/fXbt21fz58zVr1iylpaUpOTlZzz77rLP+4MGDuvTSS3Xw4EH16dOnRrasrCzNnz+/xvLs7Gy1b9/e1hAAAIAAOnXqlKZOnaqSkhLFxsYGO86/Gcv++Mc/mgsvvND88Y9/NO+9955Zu3at6dKli1mzZo0xxpi3337bSDJHjx71ud+kSZPM5MmTjTHGPPLII6ZXr141+j7//PPNM888Y4wx5rrrrjMzZ870WX/gwAEjyRw8eLDWbKdPnzYlJSXO7fPPPzeSzJdffmkqKiqs3crKysymTZtMWVmZz/Jev3q11p/V19f1uz/tGlpW2/r+D2w2mzZtMv0f2NzojPVlqi9DbVnqWtaYDLWNeUP7UNvPpoxlQ2NT275V33Zdc8bG3KhrHPzdj/oyNJS/ubfmHAv1zYWKin/P+7KysiaNVUPHdWPmnb/zs+p9qh639e1vff0091ip73GoK4t3znhz15e5MZmamrkx49LQmDd0fAfqXOPP8V21XWPOlw2Ngb/jZ/NWVlZmsrOzjSRTUlLS7LrFJusv/91zzz26//77NWXKFElS//799dlnn2nhwoW65ZZblJCQIEkqLi72uVJVXFysQYMGSZISEhJ07Ngxn37PnDmj48ePO/dPSEhQcXGxTxvv79421UVFRSkqKqrG8oiICEVERDRhb+tXvd/ys2GKiIio8bP6+rp+96ddQ8tqXV8Z5vxsbMb6MnnHoLZ2tWWpa1n1furK4G1XX7+1La9vW3WOVSOX1bau6naq36euuWjjcWjufjQmQ6COpeZkqrq8vnlf37FZV9+1jW9jMzRnfvrMpyrHrb/HlI0sDT0O9WWpmruhdg1lamrmxoxLbZlry97QnLB9rvHn+K5tn+o7XzY4Bn6OX1th/Y3qp06dUni4b7ft2rVTZWWlJCk5OVkJCQnasWOHs760tFS7d+9WSkqKJCklJUUnTpxQQUGB02bnzp2qrKzUsGHDnDZ5eXk+r63m5ubqkksuqfX9VG7Cm0UB92uLx3Fb3GegKutF1Q033KBHHnlEW7du1d///ndt3LhRS5Ys0YQJEyRJYWFhmjNnjh5++GFt3rxZ77//vm6++WYlJiZq/PjxkqQ+ffro+uuv14wZM7Rnzx69/fbbmj17tqZMmaLExERJ0tSpUxUZGanp06frwIEDWr9+vZ566illZmba3iW4DCd29+MxBOBG1l/+W7ZsmX7zm9/o5z//uY4dO6bExET97Gc/07x585w29957r8rKyjRz5kydOHFCV111lXJychQdHe20WbdunWbPnq1Ro0YpPDxcEydO1NNPP+2sj4uL0/bt25WRkaEhQ4bovPPO07x583y+ywpA6PAWSn//7dggJ2m+nvdvbRX7AcAu60VVp06dtHTpUi1durTONmFhYVqwYIEWLFhQZ5suXbooOzu73m0NGDBAb7
75ZlOjAgHHky9jADuYR3AD/vYfAACABRRVAAAAFlBUAQAAWEBRBYSAflmvNfm+tX1Sjk/PAUDLo6gCAACwgKIKLcLGlROuvgAAQhlFFQDUg2IegcYcaz0oqgAACCEUWe5FUQXUgRMbAMAfFFUAgioYxSsFM4BAoKiCVW3tycr2/jbnqxUAAMFFUQVAUtsriAG34RgNfRRVAdbz/q2t+kBozfvWWvGYAUBgUFQBAABYQFEFAABgAUVVK9KUlxp5KQg28UZ7AG0ZRRUAAIAFFFVAI7X2Dx0AAJqHogpAi6M4BdAaUVQBAABYQFEFICRw9QqA21FUAWi1KNQAtCSKKrgaT5oAgFBBUYVGoXgBAKB+FFVAAFCEMgYA2h6KKgAAAAsoqtoArhgAABB4FFUAmoWiHQC+QVEFuAgFDACELooqAAAACyiqXICrEwAAhD6KKgAAAAsoqlyMK1itV8/7twb98Q329v3hpqwAWi+KKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAIYs3oANwE4oqAAAACyiqAAAALKCoAgAAsICiCoBfeJ8TANSOogoAAMACiioAAAALKKoAuEoo/F1EAKgNRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgQUCKqn/84x+66aabdO655yomJkb9+/fXvn37nPXGGM2bN0/du3dXTEyMUlNT9fHHH/v0cfz4cU2bNk2xsbGKj4/X9OnTdfLkSZ827733nq6++mpFR0crKSlJixcvDsTuAAAANMh6UfXVV19pxIgRioiI0H//93/r4MGDeuKJJ9S5c2enzeLFi/X0009r5cqV2r17tzp06KD09HSdPn3aaTNt2jQdOHBAubm52rJli/Ly8jRz5kxnfWlpqdLS0nTRRRepoKBAjz32mLKysrRq1SrbuwQAANCgc2x3uGjRIiUlJen55593liUnJzv/NsZo6dKleuCBBzRu3DhJ0tq1a9WtWzdt2rRJU6ZM0YcffqicnBzt3btXQ4cOlSQtW7ZMY8aM0eOPP67ExEStW7dOFRUVWr16tSIjI3XppZeqsLBQS5Ys8Sm+AAAAWoL1omrz5s1KT0/XpEmT9MYbb+iCCy7Qz3/+c82YMUOSdPjwYRUVFSk1NdW5T1xcnIYNG6b8/HxNmTJF+fn5io+PdwoqSUpNTVV4eLh2796tCRMmKD8/XyNHjlRkZKTTJj09XYsWLdJXX33lc2XMq7y8XOXl5c7vpaWlkiSPxyOPx2NtDLx9eTweRbUzPv+u7adXXev9aVd9W7W1r3NZuPH5aStLfe0ak6+uLFXbVR/zQGVuTJbG7G+NfqqMfVPHL5CZGzs/vfkbm6Wx87PB8Wtm5saMkb/zs8WO72pzp6Uy+zs/65x34Q3f1/b4NXZ+1pe5tuyNnZ+BPL6rauhx8PtYaca5xqZA9GlLmDHG2OwwOjpakpSZmalJkyZp7969uvPOO7Vy5Urdcsst2rVrl0aMGKGjR4+qe/fuzv0mT56ssLAwrV+/Xo8++qheeOEFHTp0yKfvrl27av78+Zo1a5bS0tKUnJysZ5991ll/8OBBXXrppTp48KD69OlTI1tWVpbmz59fY3l2drbat29vawgAAEAAnTp1SlOnTlVJSYliY2ODHeffjGUREREmJSXFZ9kdd9xhhg8fbowx5u233zaSzNGjR33aTJo0yUyePNkYY8wjjzxievXqVaPv888/3zzzzDPGGGOuu+46M3PmTJ/1Bw4cMJLMwYMHa812+vRpU1JS4tw+//xzI8l8+eWXpqKiwtqtrKzMbNq0yZSVlZlev3rV9PrVq6aioqLOn96bjXa13aexy/o/sNls2rTJ9H9gs9UsDfXT0LLG3Lf6mAcqc2OyNGZ/qy+vOvZNHb9AZm5o3Lzj781ve342NH7NOaa8Y1/b3GnO/Gyp47v63GmpzM09vqvPmabOz2Ac37Udr/7Mz0Ae37WNRV3Ha1lZmV/j25zxs3krKysz2dnZRpIpKSlpWr
ESINZf/uvevbv69u3rs6xPnz76r//6L0lSQkKCJKm4uNjnSlVxcbEGDRrktDl27JhPH2fOnNHx48ed+yckJKi4uNinjfd3b5vqoqKiFBUVVWN5RESEIiIiGruLjRYREaHys2E+/67tp1dd6/1pV31btbWvc1llmPPTZpb62jUmX13jV7Vd9TEPVObGZmlof2v0U2Xsmzp+gczc2Pnpzd/YLI2dnw2OXzMzN2aM/J2fLXZ8V5s7LZXZ3/lZ57yrbPi+tsevsfOzvsy1ZW/s/Azk8V1VQ49DREREgxlsnWvaCuuf/hsxYkSNl+3+9re/6aKLLpL0zZvWExIStGPHDmd9aWmpdu/erZSUFElSSkqKTpw4oYKCAqfNzp07VVlZqWHDhjlt8vLyfF5bzc3N1SWXXFLr+6kAAAACyXpRddddd+mdd97Ro48+qk8++UTZ2dlatWqVMjIyJElhYWGaM2eOHn74YW3evFnvv/++br75ZiUmJmr8+PGSvrmydf3112vGjBnas2eP3n77bc2ePVtTpkxRYmKiJGnq1KmKjIzU9OnTdeDAAa1fv15PPfWUMjMzbe8SAABAg6y//Hf55Zdr48aNmjt3rhYsWKDk5GQtXbpU06ZNc9rce++9Kisr08yZM3XixAldddVVysnJcd7kLknr1q3T7NmzNWrUKIWHh2vixIl6+umnnfVxcXHavn27MjIyNGTIEJ133nmaN28eX6cAAACCwnpRJUnf//739f3vf7/O9WFhYVqwYIEWLFhQZ5suXbooOzu73u0MGDBAb775ZpNzAgAA2MLf/gMAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAABaiZ73bw12hDaNogoAAMACiioAAAALKKoAAECj8RJj3SiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgCghQT7Db7B3j7Q2lFUAQAAWEBRBQAAYAFFFQAA8BsvJ9dEUQUAAGABRRUABAn/0wdaF4oqAAgxNootCjag5VFUAQBCSlMLQgpJBNs5wQ4AAABClz/FalsvbLlSBQBoUxrzxN/WiwM0DUUVAMAVAlXo1NcvxRX8QVEFAG1YWysaWnp/29r4tnUUVQgJnHjgBnXNU+avHW4ex8Zmd8M+ejO6IWuo4Y3qAADrRUFt7Xrev1V//+1Yv3I1pu+WevL3bieqXYtsDi7ElSoAAAALuFIFAEHmvQJi4ypOc/GST9vFY998XKkCgCbgCcgX48EYgCtVaAV63r9VUe2MFl8R7CQAGhLqhUeo5AuVHPAPV6oAwA+h9GQXSllsa837htaLogquxAkXgdbz/q3MM7QYG19AWtcnLpnHLYeiCiElWAc/Jx2gdhwbjAEaj6IqyFr7wdra988teBwQKMwt4N94o3oI4KRkT7+s1ySFNdiuNY95a963YGFMEWqYk6GJK1UAAKDJKPD+jaIKABBUPCmjteDlPzRboE6InGjr/1tpjE/o4rGxh7H0xXiENq5UoU7+Hrwt8Vfa2+IJpS3uM0IP87BlhPo4h3q+YKOochl/JnRLFDmhrrn7ZuO7YxCa6vr+nlB6XL1ZQikTgLpRVLUxbngiQcvgMQeCi2Ow9aGoQoM48NFcLTmHmK8AgoWiCkCbQLGFlsR8a5soquA3/paUf+oaq2COIY9f28LjDbQMvlIhRIXiSTAUMwFtVSi8N5JzAuCLK1UB9M2fTGkZbfHkxh9fBhrGfG0deBzdgStVLYwDA27DnAWAxgn4larf/va3CgsL05w5c5xlp0+fVkZGhs4991x17NhREydOVHFxsc/9jhw5orFjx6p9+/bq2rWr7rnnHp05c8anzeuvv67LLrtMUVFRuvjii7VmzZpA7w7agOZ+OWlbuoJGwQWgNm313BDQomrv3r169tlnNWDAAJ/ld911l1599VVt2LBBb7zxho4ePaof/vCHzvqzZ89q7Nixqqio0K5du/TCCy9ozZo1mjdvntPm8OHDGj
t2rK699loVFhZqzpw5uu222/Taay33kpsbtNWJ3Rq1xGPZWuYLH6aA1HrmM9wjYEXVyZMnNW3aNP3nf/6nOnfu7CwvKSnR73//ey1ZskTf+973NGTIED3//PPatWuX3nnnHUnS9u3bdfDgQb344osaNGiQRo8erYceekgrVqxQRUWFJGnlypVKTk7WE088oT59+mj27Nn60Y9+pCeffDJQuwQgiHiCRGMxV+ofg5Z8v29bE7D3VGVkZGjs2LFKTU3Vww8/7CwvKCiQx+NRamqqs6x3797q0aOH8vPzNXz4cOXn56t///7q1q2b0yY9PV2zZs3SgQMHNHjwYOXn5/v04W1T9WXG6srLy1VeXu78XlpaKknyeDzyeDzN3WWHt6+ocOOzLKpd3b831M7p8//WVV1evZ+mLvPmrZq7of30dx+qL6+6T/7mr6p69rq2VX17/u5DbRq7H/W2q5K/oX21+ZjXlau+x6zexzLcd67Wl6G2dnUtqz7vJemSX29RVLv6j4Xa9rmued/QuDdGbdvyPoFFtau7XUPL62vj73Hrz7aawu9552fuhvr1p11Tx8Cf4zVQGerro6FtVZ3zXoGYC9W3Ecp92hJmjLE+ki+99JIeeeQR7d27V9HR0brmmms0aNAgLV26VNnZ2br11lt9ihtJuuKKK3Tttddq0aJFmjlzpj777DOfl/JOnTqlDh06aNu2bRo9erR69eqlW2+9VXPnznXabNu2TWPHjtWpU6cUExNTI1dWVpbmz59fY3l2drbat29vcQQAAECgnDp1SlOnTlVJSYliY2ODHcdh/UrV559/rjvvvFO5ubmKjo623X2zzJ07V5mZmc7vpaWlSkpKUlpamtUHxePxKDc3V7/ZF67yyjBJ0gdZ6T6XXKv/XtfyhtrVtr6py6LCjR4aWumTuyn82Tep5qXoxuavqnr2urZVfXvNfRz87aP69r3LhyzIcfIXzLu+3n21+ZjXlauu+VXX8v2//p4z56vmry9D1e021K6x/TXmZY265v11112nwY/sbPD+/vTd2EzN0dTjtiWy1betqnOmOeebpm6/vuUNsXWubE6GpvTpXV51zkdEREgK3EuCVc+9tng8Hr3yyivW+7XBelFVUFCgY8eO6bLLLnOWnT17Vnl5eVq+fLlee+01VVRU6MSJE4qPj3faFBcXKyEhQZKUkJCgPXv2+PTr/XRg1TbVPzFYXFys2NjYWq9SSVJUVJSioqJqLI+IiHAmlk3llWEqPxvmbMP779p+r2t5Q+1qW9+cZdVzN4U/+yapWVmr82ava1vVt9fcx8HfPqpv32n3fyfm8sraH9OGttfcZdVzfec32yXVzFLXcqePavnry1B1uw21a2x/jZkjjX28msKfeWObv8dtS2ard94183zT1O1L/57PTWUjeyAeh8bO8cGP7NShR74vqeZ5yWaWtsT6G9VHjRql999/X4WFhc5t6NChmjZtmvPviIgI7dixw7nPoUOHdOTIEaWkpEiSUlJS9P777+vYsWNOm9zcXMXGxqpv375Om6p9eNt4+0DrEuw3ngZ7+21ZQ2Mfyo9NKGcDYJ/1K1WdOnVSv379fJZ16NBB5557rrN8+vTpyszMVJcuXRQbG6s77rhDKSkpGj58uCQpLS1Nffv21Y9//GMtXrxYRUVFeuCBB5SRkeFcabr99tu1fPly3XvvvfrpT3+qnTt36k9/+pO2bg3Nk1hTT65NuR8ncrgB87Tt4LFGWxGUb1R/8sknFR4erokTJ6q8vFzp6el65plnnPXt2rXTli1bNGvWLKWkpKhDhw665ZZbtGDBAqdNcnKytm7dqrvuuktPPfWULrzwQj333HNKT7f/+i0ANAbFA9C2tUhR9frrr/v8Hh0drRUrVmjFihV13ueiiy7Stm3b6u33mmuu0f79+21EdC1O4m0PjzlsYB4B9vEHlV2qNZ8QW/O+tQW2Hj/mAdA0HDvBQ1EVYlrzwdCa9w0AQh3n4MCjqAoSJj
eAtow/lYLWiKIKrRJFKwCgpVFUAQAAWBCUr1RA68ZVotaDxxIIrGAfY8HefmvDlSqgmdriSakt7jMANIQrVWhTKAZqctOYuCkrQgfzBi2FogpB0xZPdG1xnwGgreDlPyDIAlVoUcABQMuiqEKbFoqFRyhmaqrWtC8A0BBe/kObEApP7oHMEAr7BwBtHVeqgACh0AGAtoWiqo3gCd4/jJd/QuFPjvCYAQg2iiqgDaHwqFsoFIYA3I2iCkCLorAD0FpRVAEAAFhAUQWgTeFKGYBAoagCAACwgKIKAADAAoqqNoyXQQAAsIeiCgAAwAKKKgBBx1VTAK0BRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABZQVAEAAFhAUQUAAGABRRUAAIAFFFUAAAAWWC+qFi5cqMsvv1ydOnVS165dNX78eB06dMinzenTp5WRkaFzzz1XHTt21MSJE1VcXOzT5siRIxo7dqzat2+vrl276p577tGZM2d82rz++uu67LLLFBUVpYsvvlhr1qyxvTsAAACNYr2oeuONN5SRkaF33nlHubm58ng8SktLU1lZmdPmrrvu0quvvqoNGzbojTfe0NGjR/XDH/7QWX/27FmNHTtWFRUV2rVrl1544QWtWbNG8+bNc9ocPnxYY8eO1bXXXqvCwkLNmTNHt912m1577TXbuwQAANCgc2x3mJOT4/P7mjVr1LVrVxUUFGjkyJEqKSnR73//e2VnZ+t73/ueJOn5559Xnz599M4772j48OHavn27Dh48qL/85S/q1q2bBg0apIceekj33XefsrKyFBkZqZUrVyo5OVlPPPGEJKlPnz5666239OSTTyo9Pd32bgEAANTLelFVXUlJiSSpS5cukqSCggJ5PB6lpqY6bXr37q0ePXooPz9fw4cPV35+vvr3769u3bo5bdLT0zVr1iwdOHBAgwcPVn5+vk8f3jZz5sypM0t5ebnKy8ud30tLSyVJHo9HHo+n2fvq5e0rKtxY67MlePO6Lbfk7uwS+YPJzdkl9+Z3a27J3dmlls1v87k1kH3aEtCiqrKyUnPmzNGIESPUr18/SVJRUZEiIyMVHx/v07Zbt24qKipy2lQtqLzrvevqa1NaWqp//etfiomJqZFn4cKFmj9/fo3l27dvV/v27Zu2k/V4aGil9T5bgltzS+7OLpE/mNycXXJvfrfmltydXWqZ/Nu2bQv4NkJJQIuqjIwMffDBB3rrrbcCuZlGmzt3rjIzM53fS0tLlZSUpLS0NMXGxlrbjsfjUW5urn6zL1zllWHW+g20qHCjh4ZWui635O7sEvmDyc3ZJffmd2tuyd3ZpZbN/0GW/bfjeDwevfLKK9b7tSFgRdXs2bO1ZcsW5eXl6cILL3SWJyQkqKKiQidOnPC5WlVcXKyEhASnzZ49e3z68346sGqb6p8YLC4uVmxsbK1XqSQpKipKUVFRNZZHREQoIiLC/51sQHllmMrPuu+Ac2tuyd3ZJfIHk5uzS+7N79bckruzSy2TPxDPraHM+qf/jDGaPXu2Nm7cqJ07dyo5Odln/ZAhQxQREaEdO3Y4yw4dOqQjR44oJSVFkpSSkqL3339fx44dc9rk5uYqNjZWffv2ddpU7cPbxtsHAABAS7J+pSojI0PZ2dl65ZVX1KlTJ+c9UHFxcYqJiVFcXJymT5+uzMxMdenSRbGxsbrjjjuUkpKi4cOHS5LS0tLUt29f/fjHP9bixYtVVFSkBx54QBkZGc6Vpttvv13Lly/Xvffeq5/+9KfauXOn/vSnP2nr1q22dwkAAKBB1q9U/e53v1NJSYmuueYade/e3bmtX7/eafPkk0/q+9//viZOnKiRI0cqIS
FBL7/8srO+Xbt22rJli9q1a6eUlBTddNNNuvnmm7VgwQKnTXJysrZu3arc3FwNHDhQTzzxhJ577jm+TgEAAASF9StVxjT8Ec3o6GitWLFCK1asqLPNRRdd1OCnBq655hrt37/f74wAAAC28bf/AAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALKCoAgAAsICiCgAAwAKKKgAAAAsoqgAAACygqAIAALCAogoAAMACiioAAAALKKoAAAAsoKgCAACwgKIKAADAAooqAAAACyiqAAAALHB9UbVixQr17NlT0dHRGjZsmPbs2RPsSAAAoA1ydVG1fv16ZWZm6sEHH9S7776rgQMHKj09XceOHQt2NAAA0Ma4uqhasmSJZsyYoVtvvVV9+/bVypUr1b59e61evTrY0QAAQBtzTrADNFVFRYUKCgo0d+5cZ1l4eLhSU1OVn59f633Ky8tVXl7u/F5SUiJJOn78uDwej7VsHo9Hp06d0jmecJ2tDLPWb6CdU2l06lSl63JL7s4ukT+Y3Jxdcm9+t+aW3J1datn8//znP6336X2OlSRjjPX+m8W41D/+8Q8jyezatctn+T333GOuuOKKWu/z4IMPGkncuHHjxo0bt1Zw+/zzz1ui5Gg0116paoq5c+cqMzPT+b2yslLHjx/Xueeeq7Awe9V6aWmpkpKS9Pnnnys2NtZav4Hm1tySu7NL5A8mN2eX3Jvfrbkld2eXWk/+gwcPKjExMdhxfLi2qDrvvPPUrl07FRcX+ywvLi5WQkJCrfeJiopSVFSUz7L4+PhARVRsbKwrJ6xbc0vuzi6RP5jcnF1yb3635pbcnV1yf/4LLrhA4eGh9dbw0Erjh8jISA0ZMkQ7duxwllVWVmrHjh1KSUkJYjIAANAWufZKlSRlZmbqlltu0dChQ3XFFVdo6dKlKisr06233hrsaAAAoI1xdVH1H//xH/p//+//ad68eSoqKtKgQYOUk5Ojbt26BTVXVFSUHnzwwRovNYY6t+aW3J1dIn8wuTm75N78bs0tuTu7RP5ACjMm1D6PCAAA4D6ufU8VAABAKKGoAgAAsICiCgAAwAKKKgAAAAvaTFG1cOFCXX755erUqZO6du2q8ePH69ChQz5tTp8+rYyMDJ177rnq2LGjJk6cWOPLRX/xi19oyJAhioqK0qBBg2ps59ChQ7r22mvVrVs3RUdH61vf+pYeeOCBRv1twRUrVqhnz56Kjo7WsGHDtGfPHp/csbGxOu+889SxY0eFhYXpxIkTIZvba+HCheratavCw8MVFhamqKgojRo1Sh999JHTJlTze8f+nHPOUVhYmM/t9ttvD+nsknTvvffWyO29bdiwIeTzL1y4UAMGDFBERITCw8MVERGh0aNH++QLZv68vDzdcMMNSkxMVFhYmDZt2uST/fLLL1dMTIwiIyMVGRmpsLAwFRYWWs1e1SeffKJOnTo1+guNg3m+sZ1b+veYR0ZGql27dmrXrp06d+6scePGOeebUM3uzR8bG1vnuSbU87fU+SZQ+SXp008/1YQJE3T++ecrNjZWkydPrpGvQcH+OzktJT093Tz//PPmgw8+MIWFhWbMmDGmR48e5uTJk06b22+/3SQlJZkdO3aYffv2meHDh5srr7zSp5877rjDLF++3Pz4xz82AwcOrLGdTz/91KxevdoUFhaav//97+aVV14xXbt2NXPnzq0330svvWQiIyPN6tWrzYEDB8
yMGTNMfHy8ufbaa53cv/zlL80ll1xi4uPjjSTz1VdfhWzu4uJiZ9xvueUWs2bNGrN161YzYsQIExMTYy644AJz5syZkM7vnTNDhw41P/zhD82oUaPMBRdcYD799FNTUlIS0tmNMSYtLc0sXbrUvP766+Yvf/mLGTVqlImLizMdOnQwX3/9dcjnT01NNeeff74ZNWqU2bBhg7n66qtNTEyMueyyy8zZs2eDnn/btm3m17/+tXn55ZeNJLNx40ZnnXfuPProo2bWrFmmf//+RvL9W6U2sntVVFSYoUOHmtGjR5u4uLh6cxsT/PON7dxVj9cHH3zQ/P73vzfXXnutSUhIMKNHjzZJSUnmzJkzIZvdmG/mzCWXXGImTpzoHK8XXHCBOXr0qNNHKOdvqfNNoPKfPHnSfOtb3zITJkww7733nnnvvffMuHHjzOWXX+6cbxqjzRRV1R07dsxIMm+88YYxxpgTJ06YiIgIs2HDBqfNhx9+aCSZ/Pz8Gvd/8MEH633Aq7rrrrvMVVddVW+bK664wmRkZDi/nz171iQmJpqFCxfWmtt7knNL7tryf/LJJ67I/93vftfceeedrp0zXt78Y8aMcUX+1157zYSHhzsFbNW5k5ubG/T8VVUvqqrbt2+fkWSee+45Y4z9sb/33nvNTTfdZJ5//vlGPcGEyvkmULmrZl+9erWRZPbv3x/y2b3nmqr5A3W+CeTYV80fqPON7fzVzzfezGFhYSY3N7fB/r3azMt/1ZWUlEiSunTpIkkqKCiQx+NRamqq06Z3797q0aOH8vPzm7ydTz75RDk5Ofrud79bZ5uKigoVFBT4bDs8PFypqak1tu3N7eWW3F5ffPGFpG/+ZlNSUpJr8q9bt069evWSJK1du1anTp1yTXavt99+W5J04403Sgr9uVNeXu68ZCz9e+6Hh4frrbfeCmp+f3399deSpLi4OEl2x37nzp3asGGDVqxY0aj2oXK+CWTuqtl37typ5ORkFRcXuyL7unXrdN555+nKK6+UJMXExEhy19hLgT3fBCJ/9fONJEVHRzvnm8Zqk0VVZWWl5syZoxEjRqhfv36SpKKiIkVGRtZ4bbZbt24qKiryextXXnmloqOj9Z3vfEdXX321FixYUGfbL7/8UmfPnq3xTfDVt+3N7c3sltyS9Mwzz6hDhw4aOHCgYmJi9MYbbygyMtIV+adOnaq1a9dqwIAB6tWrl3JycnTTTTe5IruXd+7ExMTopptukhT6c2f48OHq0KGD7rvvPp08eVJ33HGHunfvrsrKSn3xxRdBze+PyspKPfTQQ5Kkiy++WJK9sf/nP/+pn/zkJ1qzZk2j/zBuKJxvAplbkpYvX67evXtLkvbu3avc3Fz985//DPnsU6dO1YsvvqgdO3aoY8eOioyM1MKFCyW5Z+ylwJ5vApW/6vnm1KlTKisr0y9/+UudPXvWuRjQGG2yqMrIyNAHH3ygl156KWDbWL9+vd59911lZ2dr69atevzxxyVJb775pjp27Ojc1q1b1+g+vbnnzZsXqNgByS1J06ZN07hx45SQkKARI0Zo8uTJOn36tCvyz5w5U5s3b9Znn32mHTt2aO3atdq4caP/b2AMQnavn/3sZzpy5Ijuvvtuq5mrsp3//PPP14YNG/Tqq6+qU6dOysnJ0fDhw3XZZZcF5C/TB2r8MzIyanwoxpYZM2Zo6tSpGjlyZK3rQ/V8E8jckrR//34lJCRow4YN6tWrlyZPnqyKigob0QOafebMmUpPT9fKlSv11VdfOeeaTz/91Ep2KfBjLwX2fBOo/FXPNx07dlRcXJxOnDjh9/nG1X/7rylmz56tLVu2KC8vTxdeeKGzPCEhQRUVFTpx4oRPJV1cXKyEhAS/t5OUlCRJ6tu3r86ePauZM2fq7rvv1tChQ30+AdStWzdFRUWpXbt2NZ6kq267au7PPvvMNbm9fv3rX+vNN9/Url27dMEFF6hz587auHGjK/JXnzOdO3eW9M0l5V
DP7s3/8ssvq127dvrFL37hLHfD2KelpWn06NHauHGjtm3bpoEDByohIUHf+ta3gpq/sbxz549//KPPk4Ct7Dt37tTmzZudAtAYo8rKSp1zzjlatWqVbrzxxpA83wQqtzf79u3b9eabbyo5OVk/+MEP1LlzZ33yySchn92b3zv2Xbt2lfTNS9JuGHtv/kCebwKZPy0tTZ9++qm+/PJLnXPOOYqPj3fON43VZq5UGWM0e/Zsbdy40XmNvaohQ4YoIiJCO3bscJYdOnRIR44cUUpKSrO2XVlZKY/Ho8rKSsXExOjiiy92bp06dVJkZKSGDBnis+3Kykrt2LFDw4cPd2Vu77ZrG3fzzQckVF5eHtL565oz3gP2qquuCtnsku/Yf/vb39a4ceN0/vnnO+1Deeyr53/99dc1cOBA7dy5U8eOHdMPfvCDoOZvSPW54y3YvGxlz8/PV2FhoXNbsGCBOnXqpMLCQk2YMCFkzze2c9d3vHrPN4mJiSGb3Zuzen7vuaZ79+4hPfbV8wfyfBOo/FWdd955io+P9znfNFqj39LucrNmzTJxcXHm9ddfN1988YVzO3XqlNPm9ttvNz169DA7d+40+/btMykpKSYlJcWnn48//tjs37/f/OxnPzO9evUy+/fvN/v37zfl5eXGGGNefPFFs379enPw4EHz6aefmvXr15vExEQzbdq0evO99NJLJioqyqxZs8YcPHjQzJw508THx5tbbrnFyf3Xv/7V5Obmmscff9xIMnl5eWb//v3mJz/5ScjlLioqMsYYM23aNBMdHW2effZZs3fvXrN582Zz3XXXmc6dOzsfZQ3FcS8qKjKzZs0ynTp1MrfeeqvJyckxe/bsMWvWrDE9e/Y0I0eODOnsxvx7zr/44otGklm3bp0r5nzV/DExMWbFihUmPz/fLFu2zMTHx5s77rgjJPJ//fXXTl+SzJIlS8z+/fvNZ5995oz95s2bTW5urvnDH/5gJJm1a9ea/fv3my+++MJK9uoa+0moYJ9vbOeuerzedtttJicnxznfjBkzxnTp0sUUFxeH5Jh75/vUqVOdc6X3XNOjRw8zYsQIp49Qzt9S55tA5TfGmNWrV5v8/HzzySefmD/84Q+mS5cuJjMzs8G+q2ozRZX+72PB1W/PP/+80+Zf//qX+fnPf246d+5s2rdvbyZMmGC++OILn36++93v1trP4cOHjTHfPHCXXXaZ6dixo+nQoYPp27evefTRR82//vWvBjMuW7bM9OjRw0RGRporrrjCvPPOO3XmrnpbtWpVyOVuaNwfffTRkB73+rKPHj3a+dhtqGavL3+oz/mG8q9evTok8v/P//xPrf3ecsstDR6zDz74oJXs1TX2CaausW+p843t3MbUPV+GDx9uPvroI2OMnfkSiOz15X/mmWecNm7Mb/t8E6j8xhhz3333mW7dupmIiAjzne98xzzxxBOmsrKyUX17hRljjAAAANAsbeY9VQAAAIFEUQUAAGABRRUAAIAFFFUAAAAWUFQBAABYQFEFAABgAUUVAACABRRVAAAAFlBUAQAAWEBRBQAAYAFFFQAAgAUUVQAAABb8f58N2gQoK7TXAAAAAElFTkSuQmCC\n",
|
575 |
-
"text/plain": [
|
576 |
-
"<Figure size 640x480 with 1 Axes>"
|
577 |
-
]
|
578 |
-
},
|
579 |
-
"metadata": {},
|
580 |
-
"output_type": "display_data"
|
581 |
-
}
|
582 |
-
],
|
583 |
-
"source": [
|
584 |
-
"new_df.date.hist(bins=400)"
|
585 |
-
]
|
586 |
-
},
|
587 |
-
{
|
588 |
-
"cell_type": "code",
|
589 |
-
"execution_count": null,
|
590 |
-
"id": "1acf60dc",
|
591 |
-
"metadata": {},
|
592 |
-
"outputs": [],
|
593 |
-
"source": []
|
594 |
-
}
|
595 |
-
],
|
596 |
-
"metadata": {
|
597 |
-
"kernelspec": {
|
598 |
-
"display_name": "Python 3 (ipykernel)",
|
599 |
-
"language": "python",
|
600 |
-
"name": "python3"
|
601 |
-
},
|
602 |
-
"language_info": {
|
603 |
-
"codemirror_mode": {
|
604 |
-
"name": "ipython",
|
605 |
-
"version": 3
|
606 |
-
},
|
607 |
-
"file_extension": ".py",
|
608 |
-
"mimetype": "text/x-python",
|
609 |
-
"name": "python",
|
610 |
-
"nbconvert_exporter": "python",
|
611 |
-
"pygments_lexer": "ipython3",
|
612 |
-
"version": "3.10.8"
|
613 |
-
}
|
614 |
-
},
|
615 |
-
"nbformat": 4,
|
616 |
-
"nbformat_minor": 5
|
617 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
-
praw==7.7.
|
2 |
-
gradio==3.
|
3 |
nbdev==2.3.12
|
4 |
-
datasets==2.
|
5 |
requests==2.28.2
|
6 |
loguru==0.7.0
|
7 |
rich==13.3.4
|
8 |
-
|
9 |
-
|
|
|
1 |
+
praw==7.7.1
|
2 |
+
gradio==3.50.2
|
3 |
nbdev==2.3.12
|
4 |
+
datasets==2.14.6
|
5 |
requests==2.28.2
|
6 |
loguru==0.7.0
|
7 |
rich==13.3.4
|
8 |
+
supervisor==4.2.5
|
9 |
+
schedule==1.2.0
|
utilities/data_collator.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from utilities.praw_downloader import praw_downloader
|
4 |
+
from utilities.praw_processor import preprocess_praw_data
|
5 |
+
|
6 |
+
|
7 |
+
def get_latest_data():
    """
    Downloads the newest submissions and returns them as a preprocessed DataFrame.

    Returns:
    - pd.DataFrame: The result of running `preprocess_praw_data` on the
                    submissions fetched by `praw_downloader`.
    """
    # Fetch the raw records and hand them straight to the preprocessor.
    return preprocess_praw_data(submissions=praw_downloader())
|
11 |
+
|
12 |
+
|
13 |
+
def filter_redundant_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows with redundant ids, retaining the one with the longest content.

    The input DataFrame is left untouched: content lengths are computed in a
    separate Series instead of a temporary column on `df`.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id' and 'content'.

    Returns:
    - pd.DataFrame: A new DataFrame with unique ids, ordered by id, where each id
                    is associated with the longest content available. Rows keep
                    their original index labels.
    """

    # Missing content counts as length 0 so that an id whose rows all lack
    # content still yields a valid row label from idxmax (instead of NaN).
    content_length = df['content'].str.len().fillna(0)

    # For each id, the index label of the row with the longest content
    # (the first such row wins on ties).
    idx_to_keep = content_length.groupby(df['id']).idxmax().values

    # Label-based selection with a list returns a new DataFrame.
    return df.loc[idx_to_keep]
|
38 |
+
|
39 |
+
|
40 |
+
def merge_and_filter_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """
    Combines previously collected rows with the latest download, sorts the result
    by 'date_utc', and removes redundant ids.

    Parameters:
    - old_df (pd.DataFrame): The rows collected so far. It is concatenated with
                             the DataFrame returned by `get_latest_data`.

    Returns:
    - pd.DataFrame: The merged rows after `filter_redundant_ids` has been applied.
    """
    fresh_rows = get_latest_data()

    # Stack old and new rows, then order them chronologically with a clean index
    # before de-duplicating.
    combined = pd.concat([old_df, fresh_rows], ignore_index=True)
    combined = combined.sort_values(by='date_utc')
    combined = combined.reset_index(drop=True)

    return filter_redundant_ids(combined)
|
utilities/my_logger.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
|
4 |
+
def setup_logger(name: str) -> logging.Logger:
    """
    Returns a DEBUG-level logger that writes to 'mylog.log' and to the console.

    Calling this more than once with the same name returns the same logger
    without attaching additional handlers.

    Parameters:
    - name (str): The logger name passed to `logging.getLogger`.

    Returns:
    - logging.Logger: The configured logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger hands back the same object for the same name, so attach
    # handlers only once; otherwise every record would be emitted once per call.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler: explicit UTF-8 so non-ASCII log messages are written the
    # same way regardless of the platform's default encoding.
    file_handler = logging.FileHandler('mylog.log', encoding='utf-8')
    # Stream handler: writes logs to the console.
    stream_handler = logging.StreamHandler()

    for handler in (file_handler, stream_handler):
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
|
utilities/praw_downloader.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import datetime
|
3 |
+
from typing import Any, Dict, List
|
4 |
+
|
5 |
+
import praw
|
6 |
+
|
7 |
+
from utilities.my_logger import setup_logger
|
8 |
+
|
9 |
+
# Setup logging
|
10 |
+
logger = setup_logger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
def get_reddit_instance() -> praw.Reddit:
    """Build a PRAW Reddit client from the REDDIT_* environment variables."""
    # Credentials are read from the environment each time the function is called.
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
    }
    return praw.Reddit(ratelimit_seconds=20, **credentials)
|
21 |
+
|
22 |
+
|
23 |
+
def extract_submission_data(submission: praw.models.Submission) -> Dict[str, Any]:
    """
    Extract and return relevant data from a given Reddit submission.

    Parameters:
    - submission: The submission to read. The attributes used are `selftext`,
                  `author`, `created_utc`, `link_flair_text`, `title`, `ups`
                  and `permalink`.

    Returns:
    - Dict[str, Any]: A flat dictionary with the keys 'content', 'poster',
                      'date_utc' (formatted as 'YYYY-MM-DD HH:MM:SS'), 'flair',
                      'title', 'score' and 'permalink'.
    """
    # Imported here so the block does not depend on a change to the file's imports.
    from datetime import timezone

    # datetime.utcfromtimestamp is deprecated since Python 3.12; an aware UTC
    # datetime produces the same formatted string.
    created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

    return {
        "content": submission.selftext,
        # str() is applied to whatever `author` holds, so the value is always a string.
        "poster": str(submission.author),
        "date_utc": created.strftime('%Y-%m-%d %H:%M:%S'),
        "flair": submission.link_flair_text,
        "title": submission.title,
        "score": submission.ups,
        "permalink": submission.permalink,
    }
|
34 |
+
|
35 |
+
|
36 |
+
def praw_downloader(subreddit_name: str = 'bestofredditorupdates', limit=200) -> List[Dict[str, Any]]:
    """
    Fetch the newest submissions of a subreddit and return them as dictionaries.

    Parameters:
    - subreddit_name (str): The subreddit to read. Defaults to
                            'bestofredditorupdates'.
    - limit: Passed to `subreddit.new(limit=...)`. Defaults to 200; set
             limit=None to get all posts.

    Returns:
    - List[Dict[str, Any]]: One dictionary per submission, as produced by
                            `extract_submission_data`.
    """
    reddit = get_reddit_instance()
    subreddit = reddit.subreddit(subreddit_name)

    logger.info(f'Starting to fetch submissions from {subreddit_name}.')

    submissions = []
    for submission in subreddit.new(limit=limit):
        logger.debug(f'Processing post {submission.id} - {submission.title}')
        submissions.append(extract_submission_data(submission))

    logger.info(f'Finished downloading {len(submissions)} submissions.')
    return submissions
|
51 |
+
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
praw_downloader()
|
utilities/praw_processor.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from utilities.my_logger import setup_logger
|
6 |
+
|
7 |
+
# Setup logging
|
8 |
+
logger = setup_logger(__name__)
|
9 |
+
|
10 |
+
|
11 |
+
def preprocess_praw_data(submissions: List[Dict]) -> pd.DataFrame:
    """
    Preprocesses praw data into a DataFrame.

    Parameters:
    - submissions: List of submission dictionaries. Each one needs a 'date_utc'
                   and a 'permalink' entry; a 'poster_link' entry is dropped if
                   present.

    Returns:
    - pd.DataFrame: Preprocessed DataFrame with 'date_utc' converted to datetime
                    and an added 'id' column. An empty list yields an empty
                    DataFrame.
    """

    # Convert the submissions list to a DataFrame
    praw_df = pd.DataFrame(submissions)

    # Nothing was downloaded: there are no columns to transform, so return the
    # empty frame instead of failing on the missing 'date_utc' column.
    if praw_df.empty:
        return praw_df

    # Convert 'date_utc' column to datetime format
    praw_df['date_utc'] = pd.to_datetime(praw_df['date_utc'])

    # Remove 'poster_link' column if it exists
    praw_df = praw_df.drop(columns=['poster_link'], errors='ignore')

    # Splitting the permalink on '/' yields a leading empty string, so the
    # element at index 4 is the one used as 'id'.
    praw_df['id'] = praw_df['permalink'].str.split('/').str[4]

    return praw_df
|
utilities/readme_update.py
CHANGED
@@ -10,24 +10,20 @@ def get_readme_path(dataset_name):
|
|
10 |
return cached_path(readme_path, download_config=DownloadConfig())
|
11 |
|
12 |
|
13 |
-
def update_readme(dataset_name, subreddit,
|
14 |
path = get_readme_path(dataset_name=dataset_name)
|
15 |
readme_text = f"""
|
|
|
|
|
|
|
|
|
|
|
16 |
# Dataset Name
|
17 |
{dataset_name}
|
18 |
|
19 |
## Update Frequency
|
20 |
-
The dataset is updated daily
|
21 |
-
|
22 |
-
## Dataset Overview
|
23 |
-
The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
|
24 |
-
|
25 |
-
## Data Collection
|
26 |
-
This has been collected with sequential calls that follow the pagination of the pushshift request.
|
27 |
-
|
28 |
-
## Attribution
|
29 |
-
Data sourced from the Pushshift API.
|
30 |
-
"""
|
31 |
|
32 |
append_readme(path=path, readme_text=readme_text)
|
33 |
return readme_text
|
|
|
10 |
return cached_path(readme_path, download_config=DownloadConfig())
|
11 |
|
12 |
|
13 |
+
def update_readme(dataset_name, subreddit, latest_date):
    """
    Build the README text for the dataset, pass it to `append_readme`, and return it.

    Parameters:
    - dataset_name: Used to locate the README via `get_readme_path` and
                    interpolated under the 'Dataset Name' heading.
    - subreddit: Interpolated into the 'Dataset Overview' section.
    - latest_date: Interpolated into the 'Update Frequency' section.

    Returns:
    - str: The README text that was passed to `append_readme`.
    """
    path = get_readme_path(dataset_name=dataset_name)
    # The text below is emitted verbatim (it is runtime content, not a comment).
    readme_text = f"""
## Dataset Overview
The goal is to have an open dataset of `{subreddit}` submissions. Im leveraging PRAW and the reddit API to get downloads.

There is a limit of 1000 in an API call and limited search functionality, so this is run every day to get new submissions.

# Dataset Name
{dataset_name}

## Update Frequency
The dataset is updated daily with the most recent day being: {latest_date}
"""

    append_readme(path=path, readme_text=readme_text)
    return readme_text
|