|
from urlextract import URLExtract |
|
from wordcloud import WordCloud |
|
import pandas as pd |
|
from collections import Counter |
|
import emoji |
|
|
|
# Module-level URLExtract instance; fetch_stats() calls its find_urls() to count links.
extract = URLExtract() |
|
|
|
def fetch_stats(selected_user, df):
    """Return headline counts for the whole chat or for one participant.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            keep every row.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        A 4-tuple ``(messages, words, media_messages, links)`` of integers.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    messages = scoped['message']

    total_messages = scoped.shape[0]
    # A "word" is any whitespace-separated token of a message.
    total_words = sum(len(text.split()) for text in messages)
    # Media placeholders are matched exactly, trailing newline included.
    total_media = int((messages == '<Media omitted>\n').sum())
    # Links are whatever the module-level extractor finds in each message.
    total_links = sum(len(extract.find_urls(text)) for text in messages)

    return total_messages, total_words, total_media, total_links
|
|
|
def most_busy_users(df):
    """Rank participants by message count and by share of all messages.

    Args:
        df: DataFrame with a ``user`` column, one row per message.

    Returns:
        A tuple ``(top_users, percent_df)``:

        * ``top_users`` -- ``value_counts()`` of ``user`` truncated with
          ``head()``.
        * ``percent_df`` -- DataFrame with columns ``name`` (the user) and
          ``percent`` (share of all rows, rounded to two decimals), ordered
          from most to least active.
    """
    counts = df['user'].value_counts()
    top_users = counts.head()

    share = ((counts / df.shape[0]) * 100).round(2)
    # Label both columns explicitly. Renaming the defaults produced by
    # reset_index() ('index' / 'user') only works on older pandas; pandas 2.x
    # emits 'user' / 'count' instead, which left the names under 'percent'.
    percent_df = share.rename_axis('name').reset_index(name='percent')

    return top_users, percent_df
|
|
|
def create_wordcloud(selected_user, df):
    """Build a word cloud from the chat text of one participant or everyone.

    Rows whose user is ``'group_notification'`` and rows whose message is the
    exact media placeholder are left out of the text.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        Whatever ``WordCloud.generate`` returns for the space-joined messages.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    is_chat_text = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    corpus = df.loc[is_chat_text, 'message'].str.cat(sep=" ")

    cloud = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return cloud.generate(corpus)
|
|
|
def most_common_words(selected_user, df):
    """Return the 20 most frequent lower-cased words as a DataFrame.

    Group notifications and media placeholders are excluded before counting.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        DataFrame built from ``Counter.most_common(20)``: column ``0`` holds
        the word and column ``1`` its count.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    keep = (scoped['user'] != 'group_notification') & (scoped['message'] != '<Media omitted>\n')

    tally = Counter()
    for text in scoped.loc[keep, 'message']:
        # Split on whitespace first, then lower-case each token.
        tally.update(token.lower() for token in text.split())

    return pd.DataFrame(tally.most_common(20))
|
|
|
def emoji_helper(selected_user, df):
    """Count how often each emoji appears in the selected messages.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        DataFrame built from ``Counter.most_common()``: column ``0`` holds the
        emoji and column ``1`` its count, most frequent first. Empty when no
        emoji are found.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter()
    for text in df['message']:
        # Testing one character at a time against emoji.EMOJI_DATA breaks
        # multi-code-point emoji (ZWJ families, skin-tone variants, flags)
        # into fragments. emoji_list() reports each complete emoji instead.
        tally.update(found['emoji'] for found in emoji.emoji_list(text))

    # most_common() with no argument already returns every entry, sorted.
    return pd.DataFrame(tally.most_common())
|
|
|
def monthly_timeline(selected_user, df):
    """Count messages per (year, month) and add a ``"<month>-<year>"`` label.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message``, ``year``, ``month_num`` and
            ``month`` columns.

    Returns:
        DataFrame with columns ``year``, ``month_num``, ``month``,
        ``message`` (non-null message count) and ``time`` (the label).
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    per_month = (
        scoped.groupby(['year', 'month_num', 'month'])['message']
        .count()
        .reset_index()
    )
    per_month['time'] = [
        f"{month}-{year}"
        for month, year in zip(per_month['month'], per_month['year'])
    ]
    return per_month
|
|
|
def daily_timeline(selected_user, df):
    """Count messages per calendar day.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message`` and ``only_date`` columns.

    Returns:
        DataFrame with columns ``only_date`` and ``message`` (non-null message
        count for that date).
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['user'] == selected_user]

    per_day = scoped.groupby('only_date')['message'].count()
    return per_day.reset_index()
|
|
|
def week_activity_map(selected_user, df):
    """Return message counts per weekday name.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``day_name`` columns.

    Returns:
        ``value_counts()`` Series of the ``day_name`` column.
    """
    day_names = df['day_name']
    if selected_user != 'Overall':
        day_names = day_names[df['user'] == selected_user]
    return day_names.value_counts()
|
|
|
def month_activity_map(selected_user, df):
    """Return message counts per month name.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``month`` columns.

    Returns:
        ``value_counts()`` Series of the ``month`` column.
    """
    if selected_user == 'Overall':
        months = df['month']
    else:
        months = df.loc[df['user'] == selected_user, 'month']
    return months.value_counts()
|
|
|
def activity_heatmap(selected_user, df):
    """Cross-tabulate message counts by weekday name and period.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message``, ``day_name`` and ``period``
            columns.

    Returns:
        Pivot table indexed by ``day_name`` with one column per ``period``;
        combinations with no messages are filled with ``0``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    counts = scoped.pivot_table(
        index='day_name',
        columns='period',
        values='message',
        aggfunc='count',
    )
    return counts.fillna(0)
|
|
|
def words_per_user_per_month(df):
    """Tabulate how many words each user wrote in each (year, month).

    Args:
        df: DataFrame with ``user``, ``year``, ``month_num`` and ``message``
            columns.

    Returns:
        Integer DataFrame indexed by ``(year, month_num)`` with one column per
        user; user/month pairs with no messages hold ``0``.
    """
    # Per-message token counts add up to the token count of the space-joined
    # monthly text, because the joining space never merges two tokens.
    counted = df.assign(word_count=df['message'].map(lambda text: len(text.split())))

    totals = (
        counted.groupby(['user', 'year', 'month_num'])['word_count']
        .sum()
        .reset_index()
    )
    table = totals.pivot(index=['year', 'month_num'], columns='user', values='word_count')
    return table.fillna(0).astype(int)
|
|
|
def frequent_hours(selected_user, df):
    """Return message counts per value of ``hour``, ordered by hour.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user`` and ``hour`` columns.

    Returns:
        ``value_counts()`` Series of ``hour``, sorted by its index.
    """
    if selected_user == 'Overall':
        hours = df['hour']
    else:
        hours = df.loc[df['user'] == selected_user, 'hour']
    return hours.value_counts().sort_index()
|
|
|
def common_words_by_four_hours(selected_user, df):
    """List the ten most common words in each four-hour window of the day.

    Group notifications and media placeholders are excluded. Windows are
    labelled ``"0-4"``, ``"4-8"``, ... ``"20-24"`` and select rows whose
    ``hour`` satisfies ``start <= hour < start + 4``.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message`` and ``hour`` columns.

    Returns:
        DataFrame of strings with one row per window; each cell is the text
        form of a ``(word, count)`` tuple, or ``''`` where a window has fewer
        entries than the widest one.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    keep = (scoped['user'] != 'group_notification') & (scoped['message'] != '<Media omitted>\n')
    chat = scoped[keep]

    top_words = {}
    for start in range(0, 24, 4):
        stop = start + 4
        in_window = chat['hour'].ge(start) & chat['hour'].lt(stop)
        tally = Counter(
            token.lower()
            for text in chat.loc[in_window, 'message']
            for token in text.split()
        )
        top_words[f"{start}-{stop}"] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)
|
|
|
def create_wordcloud_by_four_hours(selected_user, df):
    """Build one word cloud per four-hour window of the day.

    Group notifications and media placeholders are excluded. Windows are
    labelled ``"0-4"``, ``"4-8"``, ... ``"20-24"`` and select rows whose
    ``hour`` satisfies ``start <= hour < start + 4``.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message`` and ``hour`` columns.

    Returns:
        Dict mapping window label to the result of ``WordCloud.generate``.
        Windows whose text yields no words are omitted, so a quiet window no
        longer aborts the whole call with ``ValueError``.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    wordclouds = {}
    for hour in range(0, 24, 4):
        period = temp[(temp['hour'] >= hour) & (temp['hour'] < hour + 4)]
        text = period['message'].str.cat(sep=" ")

        # A fresh WordCloud per window keeps each stored result independent.
        # NOTE(review): generate() appears to return the instance itself, so
        # sharing one instance would alias every entry -- confirm in the docs.
        wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
        try:
            wc_img = wc.generate(text)
        except ValueError:
            # WordCloud refuses text with no usable words (e.g. nobody wrote
            # during this window); skip the window instead of failing.
            continue
        wordclouds[f"{hour}-{hour + 4}"] = wc_img

    return wordclouds
|
|
|
def common_words_by_month(selected_user, df):
    """List the ten most common words for each month number.

    Group notifications and media placeholders are excluded from the word
    counts. The set of months is taken from the user-filtered frame before
    that exclusion, in order of first appearance, so a month containing only
    excluded rows still gets an (empty) row.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: DataFrame with ``user``, ``message`` and ``month_num`` columns.

    Returns:
        DataFrame of strings indexed by month number; each cell is the text
        form of a ``(word, count)`` tuple, or ``''`` where a month has fewer
        entries than the widest one.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    keep = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    chat = df[keep]

    top_words = {}
    for month in df['month_num'].unique():
        texts = chat.loc[chat['month_num'] == month, 'message']
        tally = Counter(
            token.lower()
            for text in texts
            for token in text.split()
        )
        top_words[month] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)
|
|