Soufianesejjari committed on
Commit
4f3e60d
1 Parent(s): ac5445a
Files changed (4) hide show
  1. app.py +156 -0
  2. helper.py +146 -0
  3. preprocessor.py +51 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import preprocessor
3
+ import helper
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import pandas as pd
7
+
8
# Streamlit front end for the WhatsApp chat analyzer.
#
# The script is re-executed from the top on every widget interaction, so each
# Matplotlib figure is closed right after it is rendered. Every st.pyplot()
# call receives an explicit figure, which is why no global-pyplot deprecation
# option needs to be configured here.


def _show(fig):
    """Render a Matplotlib figure in Streamlit, then release it."""
    st.pyplot(fig)
    # Without closing, figures stay registered with pyplot across reruns.
    plt.close(fig)


def _vertical_xticks(ax):
    """Rotate the x tick labels of ``ax`` so long labels do not overlap."""
    ax.tick_params(axis='x', labelrotation=90)


st.sidebar.title("WhatsApp Chat Analyzer")

uploaded_file = st.sidebar.file_uploader("Choose a file")
if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()
    data = bytes_data.decode("utf-8")
    df = preprocessor.preprocess(data)

    if df.empty:
        # Nothing matched the export format; every chart below needs rows.
        st.error("No WhatsApp messages could be parsed from the uploaded file.")
        st.stop()

    # fetch unique users
    user_list = df['user'].unique().tolist()
    if 'group_notification' in user_list:
        user_list.remove('group_notification')
    user_list.sort()
    user_list.insert(0, "Overall")

    selected_user = st.sidebar.selectbox("Show analysis wrt", user_list)

    if st.sidebar.button("Show Analysis"):

        # Stats Area
        num_messages, words, num_media_messages, num_links = helper.fetch_stats(selected_user, df)
        st.title("Top Statistics")
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.header("Total Messages")
            st.title(num_messages)
        with col2:
            st.header("Total Words")
            st.title(words)
        with col3:
            st.header("Media Shared")
            st.title(num_media_messages)
        with col4:
            st.header("Links Shared")
            st.title(num_links)

        # Monthly Timeline
        st.title("Monthly Timeline")
        timeline = helper.monthly_timeline(selected_user, df)
        fig, ax = plt.subplots()
        ax.plot(timeline['time'], timeline['message'], color='green')
        _vertical_xticks(ax)
        _show(fig)

        # Daily Timeline
        st.title("Daily Timeline")
        daily_timeline = helper.daily_timeline(selected_user, df)
        fig, ax = plt.subplots()
        ax.plot(daily_timeline['only_date'], daily_timeline['message'], color='black')
        _vertical_xticks(ax)
        _show(fig)

        # Activity Map
        st.title('Activity Map')
        col1, col2 = st.columns(2)

        with col1:
            st.header("Most busy day")
            busy_day = helper.week_activity_map(selected_user, df)
            fig, ax = plt.subplots()
            ax.bar(busy_day.index, busy_day.values, color='purple')
            _vertical_xticks(ax)
            _show(fig)

        with col2:
            st.header("Most busy month")
            busy_month = helper.month_activity_map(selected_user, df)
            fig, ax = plt.subplots()
            ax.bar(busy_month.index, busy_month.values, color='orange')
            _vertical_xticks(ax)
            _show(fig)

        # Weekly Activity Map
        st.title("Weekly Activity Map")
        user_heatmap = helper.activity_heatmap(selected_user, df)
        if user_heatmap.empty:
            st.info("No activity to display.")
        else:
            fig, ax = plt.subplots()
            # Draw on the axes created above instead of relying on pyplot's current axes.
            sns.heatmap(user_heatmap, annot=True, fmt="g", cmap="YlGnBu", cbar=False, ax=ax)
            _show(fig)

        # Most Busy Users (Overall)
        if selected_user == 'Overall':
            st.title('Most Busy Users')
            x, new_df = helper.most_busy_users(df)
            fig, ax = plt.subplots(figsize=(8, 6))

            col1, col2 = st.columns(2)

            with col1:
                ax.bar(x.index, x.values, color='red')
                _vertical_xticks(ax)
                _show(fig)
            with col2:
                st.dataframe(new_df)

        # WordCloud
        st.title("Wordcloud")
        try:
            df_wc = helper.create_wordcloud(selected_user, df)
        except ValueError:
            # WordCloud raises ValueError when the text contains no usable words.
            st.info("Not enough text to build a word cloud.")
        else:
            fig, ax = plt.subplots()
            ax.imshow(df_wc)
            _show(fig)

        # Most Common Words
        st.title('Most Common Words')
        most_common_df = helper.most_common_words(selected_user, df)
        if most_common_df.empty:
            # An empty frame may have no 0/1 columns to plot.
            st.info("No words to display.")
        else:
            fig, ax = plt.subplots()
            ax.barh(most_common_df[0], most_common_df[1])
            _vertical_xticks(ax)
            _show(fig)

        # Emoji Analysis
        st.title("Emoji Analysis")
        emoji_df = helper.emoji_helper(selected_user, df)
        if emoji_df.empty:
            st.info("No emojis found.")
        else:
            col1, col2 = st.columns(2)
            with col1:
                st.dataframe(emoji_df)
            with col2:
                fig, ax = plt.subplots()
                ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(), autopct="%0.2f")
                _show(fig)

        # Words per User per Month
        st.title("Words per User per Month")
        words_per_month_df = helper.words_per_user_per_month(df)
        st.dataframe(words_per_month_df)

        # Frequent Hours
        st.title("Frequent Hours")
        frequent_hours_df = helper.frequent_hours(selected_user, df)
        st.bar_chart(frequent_hours_df)

        # Common Words by 4-Hour Intervals
        st.title("Common Words by 4-Hour Intervals")
        common_words_by_hour_df = helper.common_words_by_four_hours(selected_user, df)
        st.dataframe(common_words_by_hour_df)

        # WordClouds by 4-Hour Intervals
        st.title("WordClouds by 4-Hour Intervals")
        try:
            wordclouds_by_hour = helper.create_wordcloud_by_four_hours(selected_user, df)
        except ValueError:
            # Raised by WordCloud when an interval has no usable words.
            wordclouds_by_hour = {}
            st.info("Not enough text to build word clouds for every interval.")
        for period, wc_img in wordclouds_by_hour.items():
            st.subheader(f"WordCloud for {period}")
            st.image(wc_img.to_array(), use_column_width=True)

        # Common Words by Month
        st.title("Common Words by Month")
        common_words_by_month_df = helper.common_words_by_month(selected_user, df)
        st.dataframe(common_words_by_month_df)
helper.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+
7
# Shared URL extractor, built once at import time and reused by fetch_stats.
extract = URLExtract()


def fetch_stats(selected_user, df):
    """Return the headline counts for the selected user's messages.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'`` to
            use every row.
        df: Chat frame with ``user`` and ``message`` columns.

    Returns:
        Tuple ``(num_messages, num_words, num_media_messages, num_links)``.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    texts = scope['message']

    message_total = scope.shape[0]
    word_total = sum(len(text.split()) for text in texts)
    # Media placeholders are matched on the exact exported text.
    media_total = int((texts == '<Media omitted>\n').sum())
    link_total = sum(len(extract.find_urls(text)) for text in texts)

    return message_total, word_total, media_total, link_total
19
+
20
def most_busy_users(df):
    """Return the most active users and each user's share of all messages.

    Args:
        df: Chat frame with a ``user`` column.

    Returns:
        Tuple ``(x, percent_df)`` where ``x`` is the message count of the five
        most frequent users and ``percent_df`` has a ``name`` column (user)
        and a ``percent`` column (share of all rows, rounded to 2 decimals).
    """
    counts = df['user'].value_counts()
    x = counts.head()

    share = (counts / df.shape[0] * 100).round(2)
    # Name both columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x, so
    # renaming 'index'/'user' after the fact mislabels the columns on 2.x.
    percent_df = share.rename_axis('name').reset_index(name='percent')
    return x, percent_df
24
+
25
def create_wordcloud(selected_user, df):
    """Build a word cloud from the selected user's text messages.

    Group notifications and media placeholders are left out of the text.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``message`` columns.

    Returns:
        The object returned by ``WordCloud.generate`` for the joined text.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    is_text = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    corpus = df.loc[is_text, 'message'].str.cat(sep=" ")

    cloud = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    return cloud.generate(corpus)
35
+
36
def most_common_words(selected_user, df):
    """Return the 20 most frequent lower-cased words of the selected user.

    Group notifications and media placeholders are ignored.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``message`` columns.

    Returns:
        DataFrame with column ``0`` (word) and column ``1`` (count), most
        frequent first. Both columns exist even when there are no words.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = [word.lower() for message in temp['message'] for word in message.split()]
    # Fix the column labels so callers can index [0] and [1] on empty results too.
    return pd.DataFrame(Counter(words).most_common(20), columns=[0, 1])
46
+
47
def emoji_helper(selected_user, df):
    """Count the emoji characters used by the selected user.

    A character counts as an emoji when it is a key of ``emoji.EMOJI_DATA``;
    messages are scanned one character at a time.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``message`` columns.

    Returns:
        DataFrame with column ``0`` (emoji) and column ``1`` (count), most
        frequent first. Both columns exist even when no emoji was found.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    tally = Counter(
        char
        for message in df['message']
        for char in message
        if char in emoji.EMOJI_DATA
    )
    # Fix the column labels so callers can index [0] and [1] on empty results too.
    return pd.DataFrame(tally.most_common(), columns=[0, 1])
54
+
55
def monthly_timeline(selected_user, df):
    """Count messages per calendar month for the selected user.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``year``, ``month_num``, ``month`` and
            ``message`` columns.

    Returns:
        DataFrame with one row per (year, month_num, month) group, the
        message count in ``message`` and a ``"<month>-<year>"`` label in
        ``time``.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    per_month = scope.groupby(['year', 'month_num', 'month'])['message'].count()
    timeline = per_month.reset_index()

    timeline['time'] = [
        f"{month}-{year}"
        for month, year in zip(timeline['month'], timeline['year'])
    ]
    return timeline
64
+
65
def daily_timeline(selected_user, df):
    """Count messages per calendar date for the selected user.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``only_date`` and ``message`` columns.

    Returns:
        DataFrame with an ``only_date`` column and the per-date message count
        in ``message``.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    per_day = scope.groupby('only_date')['message'].count()
    return per_day.reset_index()
71
+
72
def week_activity_map(selected_user, df):
    """Return the number of messages per weekday name for the selected user.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``day_name`` columns.

    Returns:
        Series of counts indexed by ``day_name`` value.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scope['day_name'].value_counts()
76
+
77
def month_activity_map(selected_user, df):
    """Return the number of messages per month name for the selected user.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``month`` columns.

    Returns:
        Series of counts indexed by ``month`` value.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return scope['month'].value_counts()
81
+
82
def activity_heatmap(selected_user, df):
    """Return a weekday x hour-period table of message counts.

    Rows are ordered Monday to Sunday and columns by the starting hour of
    the period label, so the heatmap reads chronologically instead of in
    the lexicographic order a plain pivot produces ('10-11' before '2-3').

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``day_name``, ``period`` and
            ``message`` columns; ``period`` labels look like ``'9-10'``.

    Returns:
        DataFrame of counts (missing combinations filled with 0), indexed by
        ``day_name`` with one column per ``period`` label present.
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']

    def start_hour(label):
        # Labels that do not start with digits are sorted after all hours.
        head = str(label).split('-', 1)[0]
        return int(head) if head.isdigit() else 24

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    user_heatmap = df.pivot_table(
        index='day_name', columns='period', values='message', aggfunc='count'
    ).fillna(0)

    row_order = [day for day in weekdays if day in user_heatmap.index]
    # Keep any unexpected row labels rather than dropping data.
    row_order += [day for day in user_heatmap.index if day not in weekdays]
    column_order = sorted(user_heatmap.columns, key=start_hour)
    return user_heatmap.loc[row_order, column_order]
87
+
88
def words_per_user_per_month(df):
    """Tabulate how many words each user wrote in each (year, month).

    Args:
        df: Chat frame with ``user``, ``year``, ``month_num`` and ``message``
            columns.

    Returns:
        Integer DataFrame indexed by (``year``, ``month_num``) with one
        column per user; combinations without messages hold 0.
    """
    grouped_text = df.groupby(['user', 'year', 'month_num'])['message']
    merged = grouped_text.apply(' '.join).reset_index()
    merged['word_count'] = [len(text.split()) for text in merged['message']]

    table = merged.pivot(index=['year', 'month_num'], columns='user', values='word_count')
    return table.fillna(0).astype(int)
93
+
94
def frequent_hours(selected_user, df):
    """Return the number of messages per hour value for the selected user.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user`` and ``hour`` columns.

    Returns:
        Series of counts indexed by ``hour`` value, sorted by hour.
    """
    scope = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    per_hour = scope['hour'].value_counts()
    return per_hour.sort_index()
99
+
100
def common_words_by_four_hours(selected_user, df):
    """List the ten most common words in each 4-hour window of the day.

    Group notifications and media placeholders are ignored; words are
    lower-cased before counting.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``message`` and ``hour`` columns.

    Returns:
        String DataFrame with one row per window (``'0-4'`` ... ``'20-24'``).
        Each cell is the text form of a ``(word, count)`` pair, or ``''``
        when the window has fewer words than columns.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    is_text = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    chat = df[is_text]

    top_words = {}
    for start in range(0, 24, 4):
        end = start + 4
        in_window = (chat['hour'] >= start) & (chat['hour'] < end)
        tally = Counter()
        for text in chat.loc[in_window, 'message']:
            tally.update(token.lower() for token in text.split())
        top_words[f"{start}-{end}"] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)
115
+
116
def create_wordcloud_by_four_hours(selected_user, df):
    """Build one word cloud per 4-hour window of the day.

    Group notifications and media placeholders are ignored. Windows that
    contain no usable words are skipped, because ``WordCloud.generate``
    raises ``ValueError`` for them and would otherwise abort every window.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``message`` and ``hour`` columns.

    Returns:
        Dict mapping window labels (``'0-4'`` ... ``'20-24'``) to generated
        word clouds; only windows that produced a cloud are present.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    wordclouds = {}
    for hour in range(0, 24, 4):
        period = temp[(temp['hour'] >= hour) & (temp['hour'] < hour + 4)]
        text = period['message'].str.cat(sep=" ")
        if not text.strip():
            continue  # nothing was written in this window

        wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
        try:
            wc_img = wc.generate(text)
        except ValueError:
            # Text existed but no word survived WordCloud's own filtering.
            continue
        wordclouds[f"{hour}-{hour + 4}"] = wc_img

    return wordclouds
131
+
132
def common_words_by_month(selected_user, df):
    """List the ten most common words for each month number.

    Group notifications and media placeholders are ignored when counting;
    words are lower-cased. Months are taken from the selected user's rows
    before that filtering, in order of first appearance.

    Args:
        selected_user: A value from the ``user`` column, or ``'Overall'``.
        df: Chat frame with ``user``, ``message`` and ``month_num`` columns.

    Returns:
        String DataFrame with one row per ``month_num`` value. Each cell is
        the text form of a ``(word, count)`` pair, or ``''`` when the month
        has fewer words than columns.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    is_text = (df['user'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    chat = df[is_text]

    top_words = {}
    for month_number in df['month_num'].unique():
        tally = Counter()
        for text in chat.loc[chat['month_num'] == month_number, 'message']:
            tally.update(token.lower() for token in text.split())
        top_words[month_number] = tally.most_common(10)

    table = pd.DataFrame.from_dict(top_words, orient='index')
    return table.fillna('').astype(str)
preprocessor.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+
4
# Timestamp prefix in front of every exported message, e.g. "12/05/2023, 21:15 - ".
_TIMESTAMP_PATTERN = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

# Layouts tried in order when parsing that prefix. The first one is the
# original day-first, 4-digit-year format; the others cover the 2-digit-year
# and month-first variants that the pattern above also matches.
_TIMESTAMP_FORMATS = (
    '%d/%m/%Y, %H:%M - ',
    '%d/%m/%y, %H:%M - ',
    '%m/%d/%Y, %H:%M - ',
    '%m/%d/%y, %H:%M - ',
)


def _parse_timestamps(raw_dates):
    """Parse timestamp prefixes with the first layout that fits all of them."""
    first_error = None
    for layout in _TIMESTAMP_FORMATS:
        try:
            return pd.to_datetime(raw_dates, format=layout)
        except ValueError as err:
            if first_error is None:
                first_error = err
    # Report the failure of the primary layout, as before the fallbacks existed.
    raise first_error


def _split_sender(entry_text):
    """Split "<sender>: <body>" into (sender, body); no sender means a notification."""
    # maxsplit=1 keeps any later ": " inside the message body intact.
    parts = re.split(r'([\w\W]+?):\s', entry_text, maxsplit=1)
    if parts[1:]:
        return parts[1], parts[2]
    return 'group_notification', parts[0]


def _period_label(hour):
    """Return the one-hour period label ("<hour>-<hour + 1>") for an hour of day."""
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return f"{hour}-{hour + 1}"


def preprocess(data):
    """Turn the text of a WhatsApp chat export into a message DataFrame.

    The text is cut at every timestamp prefix. Each piece becomes one row;
    pieces of the form "<sender>: <body>" are attributed to that sender, all
    others to the pseudo-user ``'group_notification'``.

    Dates are read day-first with a 4-digit year when possible; otherwise
    2-digit-year and month-first layouts are tried. When a whole file is
    ambiguous between day-first and month-first, day-first wins.

    Args:
        data: Decoded text of the exported chat.

    Returns:
        DataFrame with columns ``date``, ``user``, ``message``,
        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
        ``day_name``, ``hour``, ``minute`` and ``period``.

    Raises:
        ValueError: If the timestamps match none of the supported layouts.
    """
    messages = re.split(_TIMESTAMP_PATTERN, data)[1:]
    dates = re.findall(_TIMESTAMP_PATTERN, data)

    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    df['message_date'] = _parse_timestamps(df['message_date'])
    df.rename(columns={'message_date': 'date'}, inplace=True)

    split_entries = [_split_sender(text) for text in df['user_message']]
    df['user'] = [sender for sender, _ in split_entries]
    df['message'] = [body for _, body in split_entries]
    df.drop(columns=['user_message'], inplace=True)

    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    df['period'] = [_period_label(hour) for hour in df['hour']]

    return df
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ matplotlib
3
+ seaborn
4
+ urlextract
5
+ wordcloud
6
+ pandas
7
+ emoji