Soufianesejjari's picture
my app
4f3e60d
raw
history blame
5.74 kB
from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
import emoji
# Module-level URL extractor; fetch_stats calls its find_urls() on each message.
extract = URLExtract()
def fetch_stats(selected_user, df):
    """Return headline counts for one user or for the whole chat.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with at least 'user' and 'message' columns.

    Returns:
        A 4-tuple ``(messages, words, media_messages, links)`` of integers.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    messages = df['message']
    total_messages = len(df)
    # Words are whitespace-separated tokens, summed across all messages.
    total_words = sum(len(text.split()) for text in messages)
    # Media messages are rows whose text equals this exact placeholder.
    total_media = len(df[messages == '<Media omitted>\n'])
    # Count every URL the shared extractor finds in every message.
    total_links = sum(1 for text in messages for _ in extract.find_urls(text))
    return total_messages, total_words, total_media, total_links
def most_busy_users(df):
    """Return the most active users and every user's share of rows.

    Args:
        df: DataFrame with a 'user' column; one row per message.

    Returns:
        A tuple ``(top_users, percent_df)`` where ``top_users`` is the first
        five entries of ``df['user'].value_counts()`` and ``percent_df`` is a
        DataFrame with columns 'name' and 'percent' (share of all rows,
        rounded to two decimals), ordered from most to least active.
    """
    counts = df['user'].value_counts()
    top_users = counts.head()
    share = (counts / df.shape[0] * 100).round(2)
    # Build the frame explicitly instead of reset_index().rename(...): the
    # column names produced by value_counts().reset_index() differ between
    # pandas versions, which made the old rename mislabel the columns.
    percent_df = pd.DataFrame({'name': share.index, 'percent': share.values})
    return top_users, percent_df
def create_wordcloud(selected_user, df):
    """Generate a word cloud from the text messages of a user or the chat.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with at least 'user' and 'message' columns.

    Returns:
        The result of ``WordCloud.generate`` on all remaining messages joined
        by single spaces.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Drop rows from the 'group_notification' user and media placeholders.
    from_real_user = df['user'] != 'group_notification'
    is_text = df['message'] != '<Media omitted>\n'
    corpus = df.loc[from_real_user & is_text, 'message'].str.cat(sep=" ")

    cloud = WordCloud(
        width=500,
        height=500,
        min_font_size=10,
        background_color='white',
    )
    return cloud.generate(corpus)
def most_common_words(selected_user, df):
    """Return the 20 most frequent lower-cased words as a DataFrame.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with at least 'user' and 'message' columns.

    Returns:
        DataFrame built from ``Counter.most_common(20)``: one row per word,
        column 0 holding the word and column 1 its count.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Ignore rows from the 'group_notification' user and media placeholders.
    keep = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')

    tally = Counter()
    for text in df.loc[keep, 'message']:
        tally.update(token.lower() for token in text.split())
    return pd.DataFrame(tally.most_common(20))
def emoji_helper(selected_user, df):
    """Count emoji characters used by one user or by the whole chat.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with at least 'user' and 'message' columns.

    Returns:
        DataFrame with one row per distinct emoji character, most frequent
        first; column 0 holds the character and column 1 its count.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter()
    for text in df['message']:
        # Messages are scanned one character at a time; a character counts
        # when it is a key of emoji.EMOJI_DATA.
        tally.update(ch for ch in text if ch in emoji.EMOJI_DATA)

    # most_common() with no argument returns every entry, ordered by count.
    return pd.DataFrame(tally.most_common())
def monthly_timeline(selected_user, df):
    """Count messages per (year, month) and label each period.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message', 'year', 'month_num' and 'month'
            columns.

    Returns:
        DataFrame with columns 'year', 'month_num', 'month', 'message' (the
        count) and 'time' (a "<month>-<year>" label).
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    per_month = df.groupby(['year', 'month_num', 'month'])['message'].count()
    timeline = per_month.reset_index()
    timeline['time'] = [
        f"{month}-{year}"
        for month, year in zip(timeline['month'], timeline['year'])
    ]
    return timeline
def daily_timeline(selected_user, df):
    """Count messages per value of the 'only_date' column.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message' and 'only_date' columns.

    Returns:
        DataFrame with columns 'only_date' and 'message' (the count).
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    per_day = scoped.groupby('only_date')['message'].count()
    return per_day.reset_index()
def week_activity_map(selected_user, df):
    """Count rows per value of the 'day_name' column.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user' and 'day_name' columns.

    Returns:
        Series of counts indexed by 'day_name' value, most frequent first.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    day_names = scoped['day_name']
    return day_names.value_counts()
def month_activity_map(selected_user, df):
    """Count rows per value of the 'month' column.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user' and 'month' columns.

    Returns:
        Series of counts indexed by 'month' value, most frequent first.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    months = scoped['month']
    return months.value_counts()
def activity_heatmap(selected_user, df):
    """Build a day-by-period table of message counts.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message', 'day_name' and 'period' columns.

    Returns:
        DataFrame indexed by 'day_name' with one column per 'period' value;
        combinations with no messages hold 0.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    counts = pd.pivot_table(
        df,
        index='day_name',
        columns='period',
        values='message',
        aggfunc='count',
    )
    # Missing day/period combinations come back as NaN; report them as 0.
    return counts.fillna(0)
def words_per_user_per_month(df):
    """Tabulate how many words each user wrote in each (year, month_num).

    Args:
        df: DataFrame with 'user', 'year', 'month_num' and 'message' columns.

    Returns:
        Integer DataFrame indexed by (year, month_num) with one column per
        user; users with no messages in a month get 0.
    """

    def count_words(messages):
        # Join the group's messages with spaces, then count whitespace tokens.
        return len(' '.join(messages).split())

    grouped = df.groupby(['user', 'year', 'month_num'])['message']
    word_counts = grouped.apply(count_words).rename('word_count').reset_index()

    table = word_counts.pivot(
        index=['year', 'month_num'],
        columns='user',
        values='word_count',
    )
    return table.fillna(0).astype(int)
def frequent_hours(selected_user, df):
    """Count rows per value of the 'hour' column, ordered by hour.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user' and 'hour' columns.

    Returns:
        Series of counts indexed by 'hour' value, sorted by index.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    per_hour = scoped['hour'].value_counts()
    return per_hour.sort_index()
def common_words_by_four_hours(selected_user, df):
    """List the ten most common words for each four-hour window of the day.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message' and 'hour' columns.

    Returns:
        String DataFrame with one row per window ('0-4', '4-8', ... '20-24').
        Each cell is the text form of a ``(word, count)`` tuple; windows with
        fewer words are padded with empty strings.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Ignore rows from the 'group_notification' user and media placeholders.
    keep = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    chat = df[keep]

    top_words = {}
    for start in range(0, 24, 4):
        stop = start + 4
        in_window = chat['hour'].ge(start) & chat['hour'].lt(stop)
        tally = Counter()
        for text in chat.loc[in_window, 'message']:
            tally.update(token.lower() for token in text.split())
        top_words[f"{start}-{stop}"] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)
def create_wordcloud_by_four_hours(selected_user, df):
    """Generate one word cloud per four-hour window of the day.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message' and 'hour' columns.

    Returns:
        Dict mapping window labels ('0-4', '4-8', ... '20-24') to the result
        of ``WordCloud.generate`` for that window's messages. Windows whose
        text yields no words to plot are left out of the dict.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Ignore rows from the 'group_notification' user and media placeholders.
    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    wordclouds = {}
    for hour in range(0, 24, 4):
        period = temp[(temp['hour'] >= hour) & (temp['hour'] < hour + 4)]
        text = period['message'].str.cat(sep=" ")
        wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
        try:
            wc_img = wc.generate(text)
        except ValueError:
            # WordCloud raises ValueError when the text contains no usable
            # words (e.g. a window with no messages). Skip that window so a
            # single quiet period does not abort the remaining ones.
            continue
        wordclouds[f"{hour}-{hour + 4}"] = wc_img
    return wordclouds
def common_words_by_month(selected_user, df):
    """List the ten most common words for each 'month_num' value.

    Args:
        selected_user: User name to filter on, or 'Overall' to keep every row.
        df: DataFrame with 'user', 'message' and 'month_num' columns.

    Returns:
        String DataFrame with one row per distinct 'month_num' (in order of
        first appearance). Each cell is the text form of a ``(word, count)``
        tuple; months with fewer words are padded with empty strings.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Ignore rows from the 'group_notification' user and media placeholders.
    keep = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    chat = df[keep]

    top_words = {}
    # Months are taken from the unfiltered-by-content frame, so a month that
    # only has notifications or media still gets an (empty) row.
    for month in df['month_num'].unique():
        tally = Counter()
        for text in chat.loc[chat['month_num'] == month, 'message']:
            tally.update(token.lower() for token in text.split())
        top_words[month] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)