from collections import Counter

import emoji
import pandas as pd
from urlextract import URLExtract
from wordcloud import WordCloud

extract = URLExtract()

# Value of ``selected_user`` that disables per-user filtering.
_ALL_USERS = 'Overall'

# Value of the 'user' column that is excluded from word-based statistics.
_NOTIFICATION_USER = 'group_notification'

# Message text that is counted as a media message and excluded from
# word-based statistics.
# NOTE(review): a bare newline is an unusual marker -- confirm against the
# preprocessing step that it was not meant to be a longer placeholder string.
_MEDIA_PLACEHOLDER = '\n'


def _select_user(selected_user, df):
    """Return ``df`` restricted to ``selected_user``, or unchanged for 'Overall'."""
    if selected_user == _ALL_USERS:
        return df
    return df[df['user'] == selected_user]


def _text_messages(df):
    """Return the rows of ``df`` that are neither notifications nor media placeholders."""
    keep = (df['user'] != _NOTIFICATION_USER) & (df['message'] != _MEDIA_PLACEHOLDER)
    return df[keep]


def _lowercase_words(messages):
    """Return every whitespace-separated token of ``messages``, lower-cased."""
    return [word.lower() for message in messages for word in message.split()]


def _new_wordcloud():
    """Return a WordCloud configured with the settings shared by this module."""
    return WordCloud(width=500, height=500, min_font_size=10, background_color='white')


def fetch_stats(selected_user, df):
    """Compute headline counts for a user (or for everyone with 'Overall').

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user' and 'message'.

    Returns:
        A tuple ``(num_messages, num_words, num_media_messages, num_links)``
        where words are whitespace-separated tokens and links are the URLs
        found by the module-level ``extract`` object.
    """
    df = _select_user(selected_user, df)

    num_messages = df.shape[0]
    num_words = sum(len(message.split()) for message in df['message'])
    num_media_messages = df[df['message'] == _MEDIA_PLACEHOLDER].shape[0]
    num_links = sum(len(extract.find_urls(message)) for message in df['message'])

    return num_messages, num_words, num_media_messages, num_links


def most_busy_users(df):
    """Return the most frequent users and every user's share of messages.

    Args:
        df: DataFrame with at least the column 'user'.

    Returns:
        A tuple ``(x, percent_df)``. ``x`` is the first five entries of
        ``df['user'].value_counts()``. ``percent_df`` has the columns 'name'
        and 'percent' (share of all rows, rounded to two decimals).
    """
    counts = df['user'].value_counts()
    x = counts.head()

    share = ((counts / df.shape[0]) * 100).round(2)
    # Build the frame explicitly instead of reset_index().rename(...): the
    # column labels produced by value_counts().reset_index() differ between
    # pandas 1.x ('index'/'user') and pandas 2.x ('user'/'count'), so a
    # rename keyed on those labels mislabels the columns on newer versions.
    percent_df = pd.DataFrame({'name': share.index, 'percent': share.to_numpy()})

    return x, percent_df


def create_wordcloud(selected_user, df):
    """Generate a word cloud from a user's text messages.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user' and 'message'.

    Returns:
        The object returned by ``WordCloud.generate`` for all remaining
        messages joined with single spaces.

    Raises:
        ValueError: Propagated from ``WordCloud.generate`` when the joined
            text yields no words to draw.
    """
    temp = _text_messages(_select_user(selected_user, df))
    return _new_wordcloud().generate(temp['message'].str.cat(sep=" "))


def most_common_words(selected_user, df):
    """Return the 20 most frequent lower-cased words as a DataFrame.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user' and 'message'.

    Returns:
        DataFrame built from ``Counter.most_common(20)``: one row per word,
        first column the word, second column its count.
    """
    temp = _text_messages(_select_user(selected_user, df))
    words = _lowercase_words(temp['message'])
    return pd.DataFrame(Counter(words).most_common(20))


def emoji_helper(selected_user, df):
    """Count the emoji characters used in a user's messages.

    Matching is per character: a character counts when it is a key of
    ``emoji.EMOJI_DATA``.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user' and 'message'.

    Returns:
        DataFrame with one row per distinct emoji character, ordered from
        most to least frequent (first column emoji, second column count).
    """
    df = _select_user(selected_user, df)
    emojis = [c for message in df['message'] for c in message if c in emoji.EMOJI_DATA]
    # most_common() without an argument already returns every entry.
    return pd.DataFrame(Counter(emojis).most_common())


def monthly_timeline(selected_user, df):
    """Count messages per (year, month).

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message', 'year',
            'month_num' and 'month'.

    Returns:
        DataFrame with the columns 'year', 'month_num', 'month', 'message'
        (the count) and 'time' (a "<month>-<year>" label).
    """
    df = _select_user(selected_user, df)

    timeline = df.groupby(['year', 'month_num', 'month'])['message'].count().reset_index()
    # Vectorised equivalent of formatting f"{month}-{year}" row by row.
    timeline['time'] = timeline['month'].astype(str) + '-' + timeline['year'].astype(str)

    return timeline


def daily_timeline(selected_user, df):
    """Count messages per value of the 'only_date' column.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message' and
            'only_date'.

    Returns:
        DataFrame with the columns 'only_date' and 'message' (the count).
    """
    df = _select_user(selected_user, df)
    return df.groupby('only_date')['message'].count().reset_index()


def week_activity_map(selected_user, df):
    """Return ``value_counts()`` of the 'day_name' column for the selected user."""
    df = _select_user(selected_user, df)
    return df['day_name'].value_counts()


def month_activity_map(selected_user, df):
    """Return ``value_counts()`` of the 'month' column for the selected user."""
    df = _select_user(selected_user, df)
    return df['month'].value_counts()


def activity_heatmap(selected_user, df):
    """Build a 'day_name' x 'period' table of message counts.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message',
            'day_name' and 'period'.

    Returns:
        Pivot table indexed by 'day_name' with one column per 'period'
        value; combinations without messages are filled with 0.
    """
    df = _select_user(selected_user, df)
    return df.pivot_table(
        index='day_name', columns='period', values='message', aggfunc='count'
    ).fillna(0)


def words_per_user_per_month(df):
    """Count the words each user wrote in each (year, month_num).

    Args:
        df: DataFrame with at least the columns 'user', 'year', 'month_num'
            and 'message'.

    Returns:
        Integer DataFrame indexed by ('year', 'month_num') with one column
        per user; missing combinations are 0.
    """
    per_month = (
        df.groupby(['user', 'year', 'month_num'])['message']
        .apply(' '.join)
        .reset_index()
    )
    per_month['word_count'] = per_month['message'].apply(lambda text: len(text.split()))

    return (
        per_month.pivot(index=['year', 'month_num'], columns='user', values='word_count')
        .fillna(0)
        .astype(int)
    )


def frequent_hours(selected_user, df):
    """Return message counts per value of the 'hour' column, sorted by hour."""
    df = _select_user(selected_user, df)
    return df['hour'].value_counts().sort_index()


def common_words_by_four_hours(selected_user, df):
    """Return the 10 most common words for each 4-hour window of the day.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message' and 'hour'.

    Returns:
        String-typed DataFrame indexed by window label ("0-4", "4-8", ...,
        "20-24"); each cell is the string form of a ``(word, count)`` pair,
        or '' where a window has fewer than 10 distinct words.
    """
    temp = _text_messages(_select_user(selected_user, df))

    common_words_by_hour = {}
    for hour in range(0, 24, 4):
        period = temp[(temp['hour'] >= hour) & (temp['hour'] < hour + 4)]
        words = _lowercase_words(period['message'])
        common_words_by_hour[f"{hour}-{hour + 4}"] = Counter(words).most_common(10)

    return (
        pd.DataFrame.from_dict(common_words_by_hour, orient='index')
        .fillna('')
        .astype(str)
    )


def create_wordcloud_by_four_hours(selected_user, df):
    """Generate one word cloud per 4-hour window of the day.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message' and 'hour'.

    Returns:
        Dict mapping window labels ("0-4", "4-8", ..., "20-24") to the
        object returned by ``WordCloud.generate``. Windows whose text yields
        no words are omitted from the dict.
    """
    temp = _text_messages(_select_user(selected_user, df))

    wordclouds = {}
    for hour in range(0, 24, 4):
        period = temp[(temp['hour'] >= hour) & (temp['hour'] < hour + 4)]
        text = period['message'].str.cat(sep=" ")
        try:
            wc_img = _new_wordcloud().generate(text)
        except ValueError:
            # WordCloud refuses to render text without any words; a quiet
            # window must not abort the clouds for the other windows.
            continue
        wordclouds[f"{hour}-{hour + 4}"] = wc_img

    return wordclouds


def common_words_by_month(selected_user, df):
    """Return the 10 most common words for each 'month_num' value.

    Months are taken from the user-filtered frame before notifications and
    media placeholders are removed, so a month containing only such rows
    still appears in the result.

    Args:
        selected_user: A value of the 'user' column, or 'Overall' for no filter.
        df: DataFrame with at least the columns 'user', 'message' and
            'month_num'.

    Returns:
        String-typed DataFrame indexed by 'month_num'; each cell is the
        string form of a ``(word, count)`` pair, or '' where a month has
        fewer than 10 distinct words.
    """
    df = _select_user(selected_user, df)
    temp = _text_messages(df)

    words_by_month = {}
    for month in df['month_num'].unique():
        monthly_messages = temp[temp['month_num'] == month]
        words = _lowercase_words(monthly_messages['message'])
        words_by_month[month] = Counter(words).most_common(10)

    return (
        pd.DataFrame.from_dict(words_by_month, orient='index')
        .fillna('')
        .astype(str)
    )