Commit 749d1d8
Parent(s): 19a5703
Init commit

Files changed:
- .gitignore +2 -0
- Dockerfile +34 -0
- app.py +24 -0
- archive/subreddit_downloader.py +145 -0
- main.py +145 -0
- my_logger.py +22 -0
- notebooks/explore.ipynb +323 -0
- requirements.txt +9 -0
- supervisord.conf +20 -0
- utilities/pushshift_data.py +162 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.idea/
+notebooks/.ipynb_checkpoints
Dockerfile
ADDED
@@ -0,0 +1,34 @@
+# Use the official Python base image
+FROM python:3.9
+
+# Install Git LFS
+RUN apt-get update && apt-get install -y git-lfs
+
+# Set the working directory
+WORKDIR /app
+
+# Copy requirements.txt into the container
+COPY requirements.txt .
+
+# Install the required packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set the git credential helper to "store"
+RUN git config --global credential.helper store
+
+# Copy the rest of the application files into the container
+COPY . .
+
+# Set environment variables (Replace with your actual values)
+ENV HUGGINGFACE_AUTH_TOKEN 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
+ENV SUBREDDIT askreddit
+ENV START_DATE 2013-01-01
+
+# Copy supervisord.conf into the container
+COPY supervisord.conf .
+
+# Expose the desired port
+EXPOSE 7860
+
+# Run supervisord
+CMD ["supervisord", "-c", "supervisord.conf"]
app.py
ADDED
@@ -0,0 +1,24 @@
+import gradio as gr
+from rich.console import Console
+from rich.syntax import Syntax
+
+
+def log_file_to_html_string():
+    log_file = "mylog.log"
+
+    console = Console(record=True, width=150)
+    with open(log_file, "rt") as f:
+        syntax = Syntax(f.read(), "python", theme="monokai", word_wrap=True)
+
+    console.print(syntax)
+    html_content = console.export_html(inline_styles=True)
+
+    return html_content
+
+
+with gr.Blocks() as demo:
+    name = gr.Markdown("# Reddit Scraper")
+    output = gr.HTML(log_file_to_html_string, every=1)
+
+if __name__ == '__main__':
+    demo.launch(server_name="0.0.0.0", show_error=True, server_port=7860, enable_queue=True)
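For context on how this viewer behaves, here is a hypothetical smoke test (not part of the commit), assuming it is run from the repository root with the pinned requirements installed; it writes two records through my_logger and renders mylog.log to HTML the same way app.py does before gr.HTML displays it:

    from rich.console import Console
    from rich.syntax import Syntax

    from my_logger import setup_logger

    # Write a couple of records so mylog.log exists and has content.
    logger = setup_logger("smoke_test")
    logger.info("first line for the dashboard")
    logger.warning("second line for the dashboard")

    # Mirror log_file_to_html_string(): record the syntax-highlighted log, export it as HTML.
    console = Console(record=True, width=150)
    with open("mylog.log", "rt") as f:
        console.print(Syntax(f.read(), "python", theme="monokai", word_wrap=True))
    print(console.export_html(inline_styles=True)[:200])

In app.py the same string is handed to gr.HTML with every=1, so the page re-renders the log roughly once per second while the scraper runs.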
archive/subreddit_downloader.py
ADDED
@@ -0,0 +1,145 @@
+import csv
+import json
+import sys
+import time
+import traceback
+from datetime import datetime
+
+import requests
+
+username = ""  # put the username you want to download in the quotes
+subreddit = "BestofRedditorUpdates"  # put the subreddit you want to download in the quotes
+thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
+# leave either one blank to download an entire user's or subreddit's history
+# or fill in both to download a specific user's history from a specific subreddit
+
+# change this to one of "human", "csv" or "json"
+# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
+# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
+# - json: the full json object
+output_format = "csv"
+
+# default start time is the current time and default end time is all history
+# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
+# start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
+start_time = datetime.strptime("04/02/2023", "%m/%d/%Y")
+end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")
+
+convert_to_ascii = False  # don't touch this unless you know what you're doing
+convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing
+
+
+def write_csv_line(writer, obj, is_submission):
+    output_list = []
+    output_list.append(str(obj['score']))
+    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
+    if is_submission:
+        output_list.append(obj['title'])
+    output_list.append(f"u/{obj['author']}")
+    output_list.append(f"https://www.reddit.com{obj['permalink']}")
+    if is_submission:
+        if obj['is_self']:
+            if 'selftext' in obj:
+                output_list.append(obj['selftext'])
+            else:
+                output_list.append("")
+        else:
+            output_list.append(obj['url'])
+    else:
+        output_list.append(obj['body'])
+    writer.writerow(output_list)
+
+
+def write_json_line(handle, obj):
+    handle.write(json.dumps(obj))
+    handle.write("\n")
+
+
+def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
+    print(f"Saving to {filename}")
+
+    count = 0
+    if output_format == "human" or output_format == "json":
+        if convert_to_ascii:
+            handle = open(filename, 'w', encoding='ascii')
+        else:
+            handle = open(filename, 'w', encoding='UTF-8')
+    else:
+        handle = open(filename, 'w', encoding='UTF-8', newline='')
+        writer = csv.writer(handle)
+
+    previous_epoch = int(start_datetime.timestamp())
+    break_out = False
+    while True:
+        new_url = url_base + str(previous_epoch)
+        json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
+        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
+        try:
+            json_data = json_text.json()
+        except json.decoder.JSONDecodeError:
+            time.sleep(1)
+            continue
+
+        if 'data' not in json_data:
+            break
+        objects = json_data['data']
+        if len(objects) == 0:
+            break
+
+        for obj in objects:
+            previous_epoch = obj['created_utc'] - 1
+            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
+                break_out = True
+                break
+            count += 1
+            try:
+                if output_format == "csv":
+                    write_csv_line(writer, obj, is_submission)
+                elif output_format == "json":
+                    write_json_line(handle, obj)
+            except Exception as err:
+                if 'permalink' in obj:
+                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
+                else:
+                    print(f"Couldn't print object, missing permalink: {obj['id']}")
+                print(err)
+                print(traceback.format_exc())
+
+        if break_out:
+            break
+
+        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")
+
+    print(f"Saved {count}")
+    handle.close()
+
+
+if __name__ == "__main__":
+    filter_string = None
+    if username == "" and subreddit == "" and thread_id == "":
+        print("Fill in username, subreddit or thread id")
+        sys.exit(0)
+    if output_format not in ("human", "csv", "json"):
+        print("Output format must be one of human, csv, json")
+        sys.exit(0)
+
+    filters = []
+    if username:
+        filters.append(f"author={username}")
+    if subreddit:
+        filters.append(f"subreddit={subreddit}")
+    if thread_id:
+        if convert_thread_id_to_base_ten:
+            filters.append(f"link_id={int(thread_id, 36)}")
+        else:
+            filters.append(f"link_id=t3_{thread_id}")
+    filter_string = '&'.join(filters)
+
+    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
+
+    if not thread_id:
+        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time,
+                          end_time, True, convert_to_ascii)
+        # download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time,
+        #                   end_time, False, convert_to_ascii)
+
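The archived script pages backwards through Pushshift by appending a "before" timestamp to a fixed query string, and converts a base-36 thread id to base 10 when link_id filtering is requested. A small offline sketch of that URL construction (illustrative values only, no request is sent):

    # Rebuild the query URL the way the archived script does.
    thread_id = "107xayi"              # example id taken from the script's own comment
    subreddit = "BestofRedditorUpdates"

    filters = [f"subreddit={subreddit}", f"link_id={int(thread_id, 36)}"]  # base-36 -> base-10
    filter_string = "&".join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="
    previous_epoch = 1680393600        # an example cutoff (2023-04-02 00:00:00 UTC)
    print(url_template.format("submission", filter_string) + str(previous_epoch))

Each response moves previous_epoch to just before the oldest item returned, so successive requests walk further back in time until no data remains.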
main.py
ADDED
@@ -0,0 +1,145 @@
+import os
+import time
+from datetime import datetime, timedelta
+
+import pandas as pd
+from datasets import Dataset, DatasetDict, load_dataset
+from huggingface_hub import login
+
+from my_logger import setup_logger
+from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe
+
+# Set dataset name, path to README.md, and existing dataset details
+dataset_name = "derek-thomas/askreddit_test"
+dataset_readme_path = "README.md"
+subreddit = os.environ["SUBREDDIT"]
+
+# Authenticate with Hugging Face using an auth token
+auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]
+login(auth_token, add_to_git_credential=True)
+
+logger = setup_logger(__name__)
+
+
+def update_readme(dataset_name, subreddit, date_to_fetch):
+    readme_text = f"""
+# {dataset_name}
+
+## Dataset Overview
+The goal is to have an open dataset of `{subreddit}` submissions. This has been taken from the Pushshift API.
+
+## Data Collection
+This has been collected with sequential calls that follow the pagination of the pushshift request.
+
+
+## Data Structure
+- `all_days`: All the data after `{os.environ["START_DATE"]}`
+
+## Update Frequency
+The dataset is updated daily and covers the period from `{os.environ["START_DATE"]}` to two days ago.
+
+## Attribution
+Data sourced from the Pushshift API.
+
+## Change Log
+<details>
+<summary>Click to expand</summary>
+
+- **{datetime.now().strftime('%Y-%m-%d')}:** Added data for {date_to_fetch} to the 'all_days' split and saved as CSV
+
+</details>
+"""
+
+    return readme_text
+
+
+def main(date_to_fetch):
+    """
+    Runs the main data processing function to fetch and process subreddit data for the specified date.
+
+    Args:
+        date_to_fetch (str): The date to fetch subreddit data for, in the format "YYYY-MM-DD".
+
+    Returns:
+        most_recent_date (str): Most recent date in dataset
+    """
+
+    # Load the existing dataset from the Hugging Face hub or create a new one
+    try:
+        logger.info("Loading existing dataset")
+        dataset = load_dataset(dataset_name)
+        if "__index_level_0__" in dataset["all_days"].column_names:
+            dataset = dataset.remove_columns(["__index_level_0__"])
+    except FileNotFoundError:
+        logger.info("Creating new dataset")
+        dataset = DatasetDict()
+
+    # Call get_subreddit_day with the calculated date
+    logger.info(f"Fetching data for {date_to_fetch}")
+    submissions = scrape_submissions_by_day(subreddit, date_to_fetch)
+    df = submissions_to_dataframe(submissions)
+    logger.info(f"Data fetched for {date_to_fetch}")
+    most_recent_date = datetime.strptime(date_to_fetch, '%Y-%m-%d').date()
+
+    # Append DataFrame to split 'all_days' or create new split
+    if "all_days" in dataset:
+        logger.info("Appending data to split 'all_days'")
+        # Merge the new submissions
+        old_data = dataset['all_days'].to_pandas()
+        new_data = pd.concat([old_data, df], ignore_index=True)
+
+        # Drop duplicates just in case
+        new_data = new_data.drop_duplicates(subset=['id'], keep="first")
+        new_data_most_recent_date_raw = new_data['created_utc'].max()
+        new_data_most_recent_date_dt = datetime.strptime(new_data_most_recent_date_raw.split(' ')[0], '%Y-%m-%d').date()
+        # Adding timedelta in case there is rounding error
+        most_recent_date = max(new_data_most_recent_date_dt - timedelta(days=1), most_recent_date)
+
+        # Convert back to dataset
+        dataset["all_days"] = Dataset.from_pandas(new_data)
+    else:
+        logger.info("Creating new split 'all_days'")
+        dataset["all_days"] = Dataset.from_pandas(df)
+    # Log appending or creating split 'all'
+    logger.info("Appended or created split 'all_days'")
+
+    # Push the augmented dataset to the Hugging Face hub
+    logger.info(f"Pushing data for {date_to_fetch} to the Hugging Face hub")
+    readme_text = update_readme(dataset_name, subreddit, date_to_fetch)
+    dataset.description = readme_text
+    dataset.push_to_hub(dataset_name, token=auth_token)
+    logger.info(f"Processed and pushed data for {date_to_fetch} to the Hugging Face Hub")
+    return most_recent_date
+
+
+def run_main_continuously():
+    """
+    This function runs the given `main_function` continuously, starting from the date specified
+    in the environment variable "START_DATE" until two days ago. Once it reaches two days ago,
+    it will wait until tomorrow to start again at the same time as when it started today.
+    """
+    start_date_str = os.environ.get("START_DATE")
+    start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
+
+    # Calculate the start time for running the main_function every day.
+    start_time = datetime.now().time()
+
+    while True:
+        today = datetime.now().date()
+        two_days_ago = today - timedelta(days=2)
+
+        if start_date <= two_days_ago:
+            logger.info(f"Running main function for date: {start_date}")
+            most_recent_date = main(str(start_date))
+            start_date = most_recent_date + timedelta(days=1)
+        else:
+            tomorrow = today + timedelta(days=1)
+            now = datetime.now()
+            start_of_tomorrow = datetime.combine(tomorrow, start_time)
+            wait_until_tomorrow = (start_of_tomorrow - now).total_seconds()
+            logger.info(f"Waiting until tomorrow: {wait_until_tomorrow} seconds")
+            time.sleep(wait_until_tomorrow)
+
+
+if __name__ == '__main__':
+    run_main_continuously()
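The dataset-merging step in main() is plain pandas; a toy, self-contained sketch of the concat/deduplicate/date-bookkeeping logic follows (made-up rows, no Hub access, no datasets library involved):

    from datetime import datetime, timedelta

    import pandas as pd

    # Two made-up batches: 'a2' appears in both, like a re-scraped submission.
    old_data = pd.DataFrame({"id": ["a1", "a2"],
                             "created_utc": ["2013-01-01 10:00:00", "2013-01-02 09:30:00"]})
    df = pd.DataFrame({"id": ["a2", "a3"],
                       "created_utc": ["2013-01-02 09:30:00", "2013-01-03 18:45:00"]})

    new_data = pd.concat([old_data, df], ignore_index=True)
    new_data = new_data.drop_duplicates(subset=["id"], keep="first")   # 'a2' kept once

    # Same bookkeeping idea as main(): take the newest date, step back a day as a safety margin.
    newest = datetime.strptime(new_data["created_utc"].max().split(" ")[0], "%Y-%m-%d").date()
    print(new_data)
    print(newest - timedelta(days=1))

main() then wraps the merged frame back into Dataset.from_pandas and pushes it, so the 'all_days' split only grows by genuinely new ids.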
my_logger.py
ADDED
@@ -0,0 +1,22 @@
+import logging
+
+
+def setup_logger(name: str):
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # Create a file handler to write logs to a file
+    file_handler = logging.FileHandler('mylog.log')
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    # Create a stream handler to write logs to the console
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.DEBUG)
+    stream_handler.setFormatter(formatter)
+    logger.addHandler(stream_handler)
+
+    return logger
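Usage is the same in every module of this commit: create one named logger per module and log through it, so records reach both the console and mylog.log (which app.py then renders). For reference, with placeholder messages:

    from my_logger import setup_logger

    logger = setup_logger(__name__)
    logger.info("scraper starting")          # goes to the console and to mylog.log
    logger.error("something went wrong")

Each call attaches fresh handlers, so setup_logger is intended to be called once per module, as main.py and utilities/pushshift_data.py do.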
notebooks/explore.ipynb
ADDED
@@ -0,0 +1,323 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "730ba509",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.core.interactiveshell import InteractiveShell\n",
+    "InteractiveShell.ast_node_interactivity = \"all\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9acd4b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import sys\n",
+    "proj_dir = Path.cwd().parent\n",
+    "\n",
+    "sys.path.append(str(proj_dir))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62452860",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utilities.pushshift_data import scrape_submissions_by_day, submissions_to_dataframe, get_post_count_for_day"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a956a623",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "17df3f2812084d3591e914ffcfd948b0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "0it [00:00, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-04-12 16:23:59,392 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 20:00:00\n",
+      "2023-04-12 16:24:03,524 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 14:37:16\n",
+      "2023-04-12 16:24:08,443 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 05:02:52\n",
+      "2023-04-12 16:24:13,409 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-03-01 00:43:35\n",
+      "2023-04-12 16:24:17,548 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:28:35\n",
+      "2023-04-12 16:24:21,490 - INFO - Fetching data between timestamps 2013-02-28 20:00:00 and 2013-02-28 20:00:48\n",
+      "2023-04-12 16:24:23,658 - INFO - Finished scraping 4106 submissions in 28.86 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "subreddit_to_scrape = \"askreddit\"\n",
+    "day_to_scrape = \"2013-03-01\"\n",
+    "submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b1cc845b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>permalink</th>\n",
+       "      <th>selftext</th>\n",
+       "      <th>url</th>\n",
+       "      <th>created_utc</th>\n",
+       "      <th>author</th>\n",
+       "      <th>num_comments</th>\n",
+       "      <th>score</th>\n",
+       "      <th>title</th>\n",
+       "      <th>id</th>\n",
+       "      <th>downs</th>\n",
+       "      <th>ups</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>/r/AskReddit/comments/19hbm0/in_the_way_that_p...</td>\n",
+       "      <td>Basically, do other parts of the world have th...</td>\n",
+       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+       "      <td>2013-03-01 19:58:55</td>\n",
+       "      <td>sjr63</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>In the way that popular English and American m...</td>\n",
+       "      <td>19hbm0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>/r/AskReddit/comments/19hblp/could_i_buy_an_an...</td>\n",
+       "      <td></td>\n",
+       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+       "      <td>2013-03-01 19:58:50</td>\n",
+       "      <td>WeirdPlane</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Could I buy an Android phone without a plan an...</td>\n",
+       "      <td>19hblp</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>/r/AskReddit/comments/19hblj/how_do_i_reddit/</td>\n",
+       "      <td>Yeah.\n",
+       "\n",
+       "How do I reddit? I don't use or read re...</td>\n",
+       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+       "      <td>2013-03-01 19:58:47</td>\n",
+       "      <td>xxnovaroxgg</td>\n",
+       "      <td>14</td>\n",
+       "      <td>0</td>\n",
+       "      <td>How do I reddit</td>\n",
+       "      <td>19hblj</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>/r/AskReddit/comments/19hbjx/xpost_rsurvival_h...</td>\n",
+       "      <td>My brothers, dad and I have always been huge L...</td>\n",
+       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+       "      <td>2013-03-01 19:58:07</td>\n",
+       "      <td>tuffstough</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>(x-post r/survival) Have any redditors seen Le...</td>\n",
+       "      <td>19hbjx</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>/r/AskReddit/comments/19hbjk/female_redditors_...</td>\n",
+       "      <td>I'm curious, guys tend to get asked the usual ...</td>\n",
+       "      <td>http://www.reddit.com/r/AskReddit/comments/19h...</td>\n",
+       "      <td>2013-03-01 19:57:58</td>\n",
+       "      <td>redditredditx3</td>\n",
+       "      <td>13</td>\n",
+       "      <td>2</td>\n",
+       "      <td>Female Redditors, which part of the male physi...</td>\n",
+       "      <td>19hbjk</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           permalink  \\\n",
+       "0  /r/AskReddit/comments/19hbm0/in_the_way_that_p...   \n",
+       "1  /r/AskReddit/comments/19hblp/could_i_buy_an_an...   \n",
+       "2      /r/AskReddit/comments/19hblj/how_do_i_reddit/   \n",
+       "3  /r/AskReddit/comments/19hbjx/xpost_rsurvival_h...   \n",
+       "4  /r/AskReddit/comments/19hbjk/female_redditors_...   \n",
+       "\n",
+       "                                            selftext  \\\n",
+       "0  Basically, do other parts of the world have th...   \n",
+       "1                                                     \n",
+       "2  Yeah.\n",
+       "\n",
+       "How do I reddit? I don't use or read re...   \n",
+       "3  My brothers, dad and I have always been huge L...   \n",
+       "4  I'm curious, guys tend to get asked the usual ...   \n",
+       "\n",
+       "                                                 url          created_utc  \\\n",
+       "0  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:55   \n",
+       "1  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:50   \n",
+       "2  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:47   \n",
+       "3  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:58:07   \n",
+       "4  http://www.reddit.com/r/AskReddit/comments/19h...  2013-03-01 19:57:58   \n",
+       "\n",
+       "           author  num_comments  score  \\\n",
+       "0           sjr63             1      1   \n",
+       "1      WeirdPlane            13      1   \n",
+       "2     xxnovaroxgg            14      0   \n",
+       "3      tuffstough             0      1   \n",
+       "4  redditredditx3            13      2   \n",
+       "\n",
+       "                                               title      id  downs  ups  \n",
+       "0  In the way that popular English and American m...  19hbm0      0    1  \n",
+       "1  Could I buy an Android phone without a plan an...  19hblp      0    1  \n",
+       "2                                    How do I reddit  19hblj      0    0  \n",
+       "3  (x-post r/survival) Have any redditors seen Le...  19hbjx      0    1  \n",
+       "4  Female Redditors, which part of the male physi...  19hbjk      0    2  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = submissions_to_dataframe(submissions)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "518addff",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e5490dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start_date = datetime.strptime(\"2013-01-01\", \"%Y-%m-%d\")\n",
+    "start_date"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf13555a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"created_utc\"] = pd.to_datetime(df[\"created_utc\"], unit=\"s\").dt.tz_localize(\"UTC\").dt.strftime('%Y-%m-%d %H:%M:%S')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48e413f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e83befa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba84be68",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
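The last few notebook cells experiment with the created_utc conversion that submissions_to_dataframe also performs. A standalone version of that line, using one epoch value matching the first row in the notebook's own output, would look like:

    import pandas as pd

    # 1362167935 is the epoch for the first row shown above (2013-03-01 19:58:55 UTC).
    df = pd.DataFrame({"created_utc": [1362167935]})
    df["created_utc"] = (
        pd.to_datetime(df["created_utc"], unit="s")
          .dt.tz_localize("UTC")
          .dt.strftime("%Y-%m-%d %H:%M:%S")
    )
    print(df)   # -> 2013-03-01 19:58:55

The notebook cell is exploratory; in the pipeline the same conversion lives inside submissions_to_dataframe.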
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+praw==7.7.0
+gradio==3.23
+nbdev==2.3.12
+datasets==2.11.0
+requests==2.28.2
+loguru==0.7.0
+rich==13.3.4
+gradio==3.23.0
+supervisor==4.2.5
supervisord.conf
ADDED
@@ -0,0 +1,20 @@
+[supervisord]
+nodaemon=true
+
+[program:main]
+command=python main.py
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autostart=true
+# autorestart=true
+
+[program:app]
+command=python app.py
+stdout_logfile=/dev/null
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autostart=true
+autorestart=true
utilities/pushshift_data.py
ADDED
@@ -0,0 +1,162 @@
+import time
+from datetime import datetime, timedelta, timezone
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+import requests
+
+from my_logger import setup_logger
+
+logger = setup_logger(__name__)
+
+
+def get_pushshift_data(subreddit: str, before: Optional[int] = None,
+                       after: Optional[int] = None, aggs: Optional[str] = None) -> Optional[Dict[str, Any]]:
+    """
+    Fetch data from the Pushshift API for the specified subreddit.
+
+    :param subreddit: The name of the subreddit to scrape.
+    :param before: The upper limit for the created_utc attribute of the submissions.
+    :param after: The lower limit for the created_utc attribute of the submissions.
+    :param aggs: The aggregation summary option to use.
+    :return: A dictionary containing the fetched data and aggregations if available.
+    """
+    url = "https://api.pushshift.io/reddit/search/submission/"
+    params = {
+        "subreddit": subreddit,
+        "size": 1000,
+        "sort": "created_utc",
+        "sort_type": "desc",
+    }
+    if before is not None:
+        params["before"] = before
+    if after is not None:
+        params["after"] = after
+    if aggs is not None:
+        params["aggs"] = aggs
+
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        logger.error(f"Error fetching data: {response.status_code}")
+        return None
+
+
+def get_post_count_for_day(subreddit: str, day_to_scrape: str) -> int:
+    """
+    Get the total number of posts for a specific day in the specified subreddit using the Pushshift API.
+
+    :param subreddit: The name of the subreddit to get the post count for.
+    :param day_to_scrape: The date for which to get the post count (format: "YYYY-MM-DD").
+    :return: The total number of posts for the specified day.
+    """
+    date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
+    after = int(date_obj.timestamp())
+    before = int((date_obj + timedelta(days=1)).timestamp())
+
+    response = get_pushshift_data(subreddit, before=before, after=after, aggs="created_utc")
+    if response is not None:
+        aggs = response.get("aggs", {}).get("created_utc", [])
+        if aggs:
+            return aggs[0]["doc_count"]
+    return 0
+
+
+def fetch_data(subreddit: str, before: int, after: int) -> Optional[Dict[str, Any]]:
+    url = "https://api.pushshift.io/reddit/search/submission/"
+    params = {
+        "subreddit": subreddit,
+        "size": 1000,
+        "sort": "created_utc",
+        "sort_type": "desc",
+        "before": before,
+        "after": after,
+    }
+
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return response.json()
+    else:
+        logger.error(f"Error fetching data: {response.status_code}")
+        return None
+
+
+def convert_timestamp_to_datetime(timestamp: int) -> str:
+    # Convert the timestamp to a datetime object
+    datetime_obj = datetime.utcfromtimestamp(timestamp)
+
+    # Add timezone information
+    datetime_obj_utc = datetime_obj.replace(tzinfo=timezone.utc)
+
+    # Convert the datetime object to a formatted string
+    datetime_str = datetime_obj_utc.strftime('%Y-%m-%d %H:%M:%S')
+
+    return datetime_str
+
+
+def scrape_submissions_by_day(subreddit_to_scrape: str, day_to_scrape: str) -> List[Dict[str, Any]]:
+    start_time = time.time()
+    scraped_submissions = []
+    date_obj = datetime.strptime(day_to_scrape, "%Y-%m-%d")
+
+    if date_obj > datetime.now() - timedelta(days=7):
+        logger.error("The specified date might not be available in the Pushshift API yet. "
+                     "Please try an earlier date or wait for the API to be updated.")
+        return scraped_submissions
+
+    after = int(date_obj.timestamp())
+    before = int((date_obj + timedelta(days=1)).timestamp())
+
+    # TODO: get_post_count_for_day didn't seem to work
+    # post_count = get_post_count_for_day(subreddit_to_scrape, day_to_scrape)
+    # total_requests = (post_count + 99) // 100  # Estimate the total number of requests
+
+    actual_requests = 0
+    while after < before:
+        after_str, before_str = convert_timestamp_to_datetime(after), convert_timestamp_to_datetime(before)
+        logger.info(f"Fetching data between timestamps {after_str} and {before_str}")
+        data = get_pushshift_data(subreddit_to_scrape, before=before, after=after)
+        if data is None or len(data["data"]) == 0:
+            break
+
+        scraped_submissions.extend(data["data"])
+        before = data["data"][-1]["created_utc"]
+
+        actual_requests += 1
+        time.sleep(1)
+
+    elapsed_time = time.time() - start_time
+    if actual_requests:
+        logger.info(
+            f"{actual_requests}it [{elapsed_time // 60:02}:{elapsed_time % 60:.2f} {elapsed_time / actual_requests:.2f}s/it]")
+    logger.info(
+        f"Finished scraping {len(scraped_submissions)} submissions in {elapsed_time:.2f} seconds in {actual_requests} requests")
+    return scraped_submissions
+
+
+def submissions_to_dataframe(submissions: List[Dict[str, Any]]) -> pd.DataFrame:
+    """
+    Parse a list of submissions into a pandas DataFrame.
+
+    :param submissions: A list of dictionaries containing the scraped submission data.
+    :return: A pandas DataFrame containing the submission data.
+    """
+    cols = ['score', 'num_comments', 'title', 'permalink', 'selftext', 'url', 'created_utc', 'author', 'id',
+            'downs', 'ups']
+    df = pd.DataFrame(submissions)
+    df = df.convert_dtypes()
+    df = df[cols]
+    # Convert the "created_utc" column to a datetime column with timezone information
+    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').dt.tz_localize('UTC').dt.strftime(
+        '%Y-%m-%d %H:%M:%S')
+    return df
+
+
+if __name__ == '__main__':
+    subreddit_to_scrape = "askreddit"
+    day_to_scrape = "2013-03-01"
+    submissions = scrape_submissions_by_day(subreddit_to_scrape, day_to_scrape)
+    df = submissions_to_dataframe(submissions)
+    print(df.head().to_string())
+    logger.info(f"Scraped {len(submissions)} submissions from r/{subreddit_to_scrape} on {day_to_scrape}")
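The module's __main__ block already shows live scraping; a network-free way to check just the DataFrame conversion is to hand submissions_to_dataframe a single made-up record with the columns it selects (run from the repository root so my_logger resolves; all field values below are invented):

    from utilities.pushshift_data import submissions_to_dataframe

    fake_submission = {
        "score": 1, "num_comments": 13, "title": "Example question?",
        "permalink": "/r/AskReddit/comments/abc123/example_question/",
        "selftext": "", "url": "http://www.reddit.com/r/AskReddit/comments/abc123/",
        "created_utc": 1362167930, "author": "example_user", "id": "abc123",
        "downs": 0, "ups": 1,
    }

    df = submissions_to_dataframe([fake_submission])
    print(df.dtypes)
    print(df.loc[0, "created_utc"])   # rendered as "2013-03-01 19:58:50" in UTC

Because the function reindexes to a fixed column list, extra Pushshift fields are dropped and a missing column raises a KeyError, which is worth keeping in mind if the API payload changes.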