Jan Winkler commited on
Commit
719aa88
1 Parent(s): 2a66be9

delete redundant code, update paths and restructuring (#6)

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. python/notebooks/eda_jan.py +59 -171
README.md CHANGED
@@ -64,6 +64,6 @@ now you shoold see the running docker containers.
64
  1. `chmod +x format` so that the `format` file is executable
65
  2. then simply use `./format` before adding your changes and all the files will be autoformatted
66
 
67
- ## awesome data overview
68
 
69
  ![](./docs/data_overview_yuri.jpg)
 
64
  1. `chmod +x format` so that the `format` file is executable
65
  2. then simply use `./format` before adding your changes and all the files will be autoformatted
66
 
67
+ ## awesome data overview
68
 
69
  ![](./docs/data_overview_yuri.jpg)
python/notebooks/eda_jan.py CHANGED
@@ -12,114 +12,6 @@
12
  # name: python3
13
  # ---
14
 
15
- # %%
16
- import os
17
- import numpy as np
18
- import librosa
19
- import librosa.display
20
- import matplotlib.pyplot as plt
21
- from sklearn.cluster import KMeans
22
- from sklearn.decomposition import PCA
23
- from IPython.display import Audio, display
24
-
25
- # %%
26
- # Load the entire audio file
27
- cwd = os.getcwd()
28
- relative_path = "data/soundscape_data/PER_001_S01_20190116_100007Z.flac"
29
- file_path = os.path.join(cwd, relative_path)
30
- y, sr = librosa.load(file_path, sr=44100)
31
-
32
- # %%
33
- # split soundfile in to 10s chunks
34
- window_size = 10 # window size in seconds
35
- hop_size = 10 # hop size in seconds
36
-
37
- # Convert window and hop size to samples
38
- window_samples = int(window_size * sr)
39
- hop_samples = int(hop_size * sr)
40
-
41
- # Total number of windows
42
- num_windows = (len(y) - window_samples) // hop_samples + 1
43
-
44
- print(f"Total number of windows: {num_windows}")
45
-
46
-
47
- # %%
48
- # Define frequency bands (in Hz)
49
- bands = {
50
- "Sub-bass": (20, 60),
51
- "Bass": (60, 250),
52
- "Low Midrange": (250, 500),
53
- "Midrange": (500, 2000),
54
- "Upper Midrange": (2000, 4000),
55
- "Presence": (4000, 6000),
56
- "Brilliance": (6000, 20000),
57
- }
58
-
59
- # Initialize a list to hold the features
60
- all_features = []
61
-
62
- for i in range(num_windows):
63
- start_sample = i * hop_samples
64
- end_sample = start_sample + window_samples
65
- y_window = y[start_sample:end_sample]
66
-
67
- # Compute STFT
68
- S = librosa.stft(y_window)
69
- S_db = librosa.amplitude_to_db(np.abs(S))
70
-
71
- # Compute features for each band
72
- features = []
73
- for band, (low_freq, high_freq) in bands.items():
74
- low_bin = int(np.floor(low_freq * (S.shape[0] / sr)))
75
- high_bin = int(np.ceil(high_freq * (S.shape[0] / sr)))
76
- band_energy = np.mean(S_db[low_bin:high_bin, :], axis=0)
77
- features.append(band_energy)
78
-
79
- # Flatten the feature array and add to all_features
80
- features_flat = np.concatenate(features)
81
- all_features.append(features_flat)
82
-
83
- # Convert to numpy array
84
- all_features = np.array(all_features)
85
-
86
-
87
- # %%
88
- # Reduce dimensionality with PCA
89
- pca = PCA(n_components=2)
90
- features_reduced = pca.fit_transform(all_features)
91
-
92
- # Perform k-means clustering
93
- kmeans = KMeans(n_clusters=5) # Example: 5 clusters
94
- clusters = kmeans.fit_predict(features_reduced)
95
-
96
- # Plot the clusters
97
- plt.figure(figsize=(10, 6))
98
- scatter = plt.scatter(
99
- features_reduced[:, 0], features_reduced[:, 1], c=clusters, cmap="viridis"
100
- )
101
- plt.title("Clustered Frequency Band Features")
102
- plt.xlabel("Principal Component 1")
103
- plt.ylabel("Principal Component 2")
104
- plt.colorbar(scatter, label="Cluster")
105
- plt.show()
106
-
107
-
108
- # %%
109
- # Play the audio for a representative sample from each cluster
110
- for cluster_label in np.unique(clusters):
111
- # Find the first data point in the cluster
112
- representative_index = np.where(clusters == cluster_label)[0][0]
113
-
114
- # Use the original audio window at the representative index
115
- start_sample = representative_index * hop_samples
116
- end_sample = start_sample + window_samples
117
- y_representative = y[start_sample:end_sample]
118
-
119
- print(f"Cluster {cluster_label} representative audio:")
120
- display(Audio(data=y_representative, rate=sr))
121
-
122
-
123
  # %% [markdown]
124
  # ## pipeline for all the files
125
 
@@ -140,12 +32,22 @@ from sklearn.ensemble import RandomForestClassifier
140
 
141
 
142
  # %%
 
143
  # Directory containing the audio files
144
  # audio_dir = "data/soundscape_data"
145
  audio_dir = (
146
- "data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
147
  )
148
 
 
 
 
 
 
 
 
 
 
149
  # Parameters for windowing
150
  window_size = 10 # window size in seconds
151
  hop_size = 10 # hop size in seconds
@@ -161,15 +63,11 @@ bands = {
161
  "Brilliance": (6000, 20000),
162
  }
163
 
164
- # Directory to save features
165
- features_dir = "features"
166
- os.makedirs(features_dir, exist_ok=True)
167
-
168
  # Iterate over each audio file in the directory
169
  for filename in os.listdir(audio_dir):
170
  if filename.endswith(".wav"):
171
  file_path = os.path.join(audio_dir, filename)
172
- y, sr = librosa.load(file_path, sr=44100)
173
 
174
  # Convert window and hop size to samples
175
  window_samples = int(window_size * sr)
@@ -216,11 +114,9 @@ for filename in os.listdir(audio_dir):
216
 
217
 
218
  # %%
219
- # Directory to load features
220
- features_dir = "features"
221
  n_clusters = 5
222
 
223
-
224
  # Load all features
225
  all_features = []
226
  for feature_file in os.listdir(features_dir):
@@ -239,21 +135,36 @@ features_pca = pca.fit_transform(all_features)
239
  kmeans = KMeans(n_clusters=n_clusters) # Example: 5 clusters
240
  clusters = kmeans.fit_predict(all_features)
241
 
242
- # Plot the PCA-reduced features with cluster labels
 
 
 
 
 
 
243
  plt.figure(figsize=(10, 6))
244
- scatter = plt.scatter(
245
- features_pca[:, 0], features_pca[:, 1], c=clusters, cmap="viridis"
246
- )
 
 
 
 
 
 
 
 
 
 
 
247
  plt.title("PCA of Clustered Frequency Band Features")
248
  plt.xlabel("Principal Component 1")
249
  plt.ylabel("Principal Component 2")
250
- plt.colorbar(scatter, label="Cluster")
251
  plt.show()
252
 
253
- # Save clustering results
254
- clustering_results = {"clusters": clusters, "kmeans": kmeans, "pca": pca}
255
- joblib.dump(clustering_results, "clustering_results.pkl")
256
 
 
257
  # Plot the clusters
258
  plt.figure(figsize=(10, 6))
259
  for i in range(n_clusters):
@@ -264,21 +175,26 @@ plt.xlabel("Feature Index (Frequency Bands)")
264
  plt.ylabel("Mean Feature Value (Energy in dB)")
265
  plt.show()
266
 
 
267
  # %%
268
- # Directory containing the audio files
269
- # audio_dir = "data/soundscape_data"
270
- audio_dir = (
271
- "data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
272
- )
273
- # Directory to load features
274
- features_dir = "features"
 
 
 
275
 
 
276
  # Parameters for windowing
277
  window_size = 10 # window size in seconds
278
  hop_size = 10 # hop size in seconds
279
 
280
  # Load clustering results
281
- clustering_results = joblib.load("clustering_results.pkl")
282
  clusters = clustering_results["clusters"]
283
 
284
  # Load all features
@@ -290,7 +206,7 @@ for feature_file in os.listdir(features_dir):
290
  features, scaler = joblib.load(os.path.join(features_dir, feature_file))
291
  filename = feature_file.replace("_features.npy", ".wav")
292
  file_path = os.path.join(audio_dir, filename)
293
- y, sr = librosa.load(file_path, sr=44100)
294
 
295
  # Convert window and hop size to samples
296
  window_samples = int(window_size * sr)
@@ -307,7 +223,8 @@ for feature_file in os.listdir(features_dir):
307
  # Flatten the list of all features
308
  all_features = np.vstack(all_features)
309
 
310
- # Play the audio for a representative sample from each cluster
 
311
  for cluster_label in np.unique(clusters):
312
  try:
313
  # Find the first data point in the cluster
@@ -323,12 +240,17 @@ for cluster_label in np.unique(clusters):
323
  print(f"Cluster {cluster_label} representative audio:")
324
  display(Audio(data=y_representative, rate=sr))
325
 
 
 
 
 
 
326
  except Exception as e:
327
  print(f"Could not play audio for cluster {cluster_label}: {e}")
328
 
329
-
330
  # %%
331
-
 
332
  # Fit PCA
333
  pca = PCA().fit(all_features_scaled)
334
 
@@ -367,40 +289,6 @@ kaiser_criterion = np.sum(eigenvalues > 1)
367
  print(f"Number of components selected by Kaiser Criterion: {kaiser_criterion}")
368
 
369
 
370
- # %%
371
- # Method 4: Cross-Validation
372
- # Evaluate a classifier with different numbers of principal components
373
-
374
- ## do not run if you dont have time, this takes forever.
375
- # scores = []
376
- # for n_components in range(1, len(explained_variance) + 1):
377
- # pca = PCA(n_components=n_components)
378
- # features_pca = pca.fit_transform(all_features_scaled)
379
- # classifier = RandomForestClassifier() # Use your preferred model here
380
- # score = np.mean(cross_val_score(classifier, features_pca, clusters, cv=n_clusters)) # Assuming `clusters` are your labels
381
- # scores.append(score)
382
-
383
- # # Plot cross-validation scores
384
- # plt.figure(figsize=(10, 6))
385
- # plt.plot(range(1, len(explained_variance) + 1), scores, marker='o')
386
- # plt.xlabel('Number of Principal Components')
387
- # plt.ylabel('Cross-Validation Score')
388
- # plt.title('Cross-Validation Score vs. Number of Principal Components')
389
- # plt.grid(True)
390
- # plt.show()
391
-
392
- # # Choosing the number of components that explain at least 95% of the variance
393
- # n_components_variance = np.argmax(cumulative_explained_variance >= 0.95) + 1
394
- # print(f"Number of components to retain 95% variance: {n_components_variance}")
395
-
396
- # # Choose the optimal number of components based on your analysis
397
- # optimal_n_components = n_components_variance # or based on the scree plot, cross-validation, etc.
398
- # print(f"Optimal number of components: {optimal_n_components}")
399
-
400
- # # Perform PCA with the selected number of components
401
- # pca = PCA(n_components=optimal_n_components)
402
- # features_pca = pca.fit_transform(all_features_scaled)
403
-
404
  # %%
405
 
406
  # %%
 
12
  # name: python3
13
  # ---
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # %% [markdown]
16
  # ## pipeline for all the files
17
 
 
32
 
33
 
34
  # %%
35
+ ## cockpit for directories
36
  # Directory containing the audio files
37
  # audio_dir = "data/soundscape_data"
38
  audio_dir = (
39
+ "../data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
40
  )
41
 
42
+ # Directory to save features
43
+ features_dir = "../data/features"
44
+ os.makedirs(features_dir, exist_ok=True)
45
+
46
+ # Directory to save clusters information
47
+ clusters_dir = "../data/clusters"
48
+ os.makedirs(clusters_dir, exist_ok=True)
49
+
50
+ # %%
51
  # Parameters for windowing
52
  window_size = 10 # window size in seconds
53
  hop_size = 10 # hop size in seconds
 
63
  "Brilliance": (6000, 20000),
64
  }
65
 
 
 
 
 
66
  # Iterate over each audio file in the directory
67
  for filename in os.listdir(audio_dir):
68
  if filename.endswith(".wav"):
69
  file_path = os.path.join(audio_dir, filename)
70
+ y, sr = librosa.load(file_path, sr=None)
71
 
72
  # Convert window and hop size to samples
73
  window_samples = int(window_size * sr)
 
114
 
115
 
116
  # %%
117
+ # Number of clusters
 
118
  n_clusters = 5
119
 
 
120
  # Load all features
121
  all_features = []
122
  for feature_file in os.listdir(features_dir):
 
135
  kmeans = KMeans(n_clusters=n_clusters) # Example: 5 clusters
136
  clusters = kmeans.fit_predict(all_features)
137
 
138
+ # Save clustering results
139
+ clustering_results = {"clusters": clusters, "kmeans": kmeans, "pca": pca}
140
+ joblib.dump(clustering_results, os.path.join(clusters_dir, "clustering_results.pkl"))
141
+
142
+
143
+ # %%
144
+ # Plot the PCA-reduced features with cluster labels using a legend
145
  plt.figure(figsize=(10, 6))
146
+
147
+ # Define a colormap
148
+ colors = plt.cm.tab10(np.arange(kmeans.n_clusters))
149
+
150
+ for cluster_label in np.unique(clusters):
151
+ cluster_points = features_pca[clusters == cluster_label]
152
+ plt.scatter(
153
+ cluster_points[:, 0],
154
+ cluster_points[:, 1],
155
+ s=50,
156
+ color=colors[cluster_label],
157
+ label=f"Cluster {cluster_label}",
158
+ )
159
+
160
  plt.title("PCA of Clustered Frequency Band Features")
161
  plt.xlabel("Principal Component 1")
162
  plt.ylabel("Principal Component 2")
163
+ plt.legend()
164
  plt.show()
165
 
 
 
 
166
 
167
+ # %%
168
  # Plot the clusters
169
  plt.figure(figsize=(10, 6))
170
  for i in range(n_clusters):
 
175
  plt.ylabel("Mean Feature Value (Energy in dB)")
176
  plt.show()
177
 
178
+
179
  # %%
180
+ # Function to plot the spectrogram
181
+ def plot_spectrogram(y, sr, title):
182
+ S = librosa.stft(y)
183
+ S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
184
+ plt.figure(figsize=(10, 6))
185
+ librosa.display.specshow(S_db, sr=sr, x_axis="time", y_axis="log")
186
+ plt.colorbar(format="%+2.0f dB")
187
+ plt.title(title)
188
+ plt.show()
189
+
190
 
191
+ # %%
192
  # Parameters for windowing
193
  window_size = 10 # window size in seconds
194
  hop_size = 10 # hop size in seconds
195
 
196
  # Load clustering results
197
+ clustering_results = joblib.load(os.path.join(clusters_dir, "clustering_results.pkl"))
198
  clusters = clustering_results["clusters"]
199
 
200
  # Load all features
 
206
  features, scaler = joblib.load(os.path.join(features_dir, feature_file))
207
  filename = feature_file.replace("_features.npy", ".wav")
208
  file_path = os.path.join(audio_dir, filename)
209
+ y, sr = librosa.load(file_path, sr=None)
210
 
211
  # Convert window and hop size to samples
212
  window_samples = int(window_size * sr)
 
223
  # Flatten the list of all features
224
  all_features = np.vstack(all_features)
225
 
226
+
227
+ # %%
228
  for cluster_label in np.unique(clusters):
229
  try:
230
  # Find the first data point in the cluster
 
240
  print(f"Cluster {cluster_label} representative audio:")
241
  display(Audio(data=y_representative, rate=sr))
242
 
243
+ # Plot the spectrogram
244
+ plot_spectrogram(
245
+ y_representative, sr, f"Spectrogram for Cluster {cluster_label}"
246
+ )
247
+
248
  except Exception as e:
249
  print(f"Could not play audio for cluster {cluster_label}: {e}")
250
 
 
251
  # %%
252
+ scaler = StandardScaler()
253
+ all_features_scaled = scaler.fit_transform(all_features)
254
  # Fit PCA
255
  pca = PCA().fit(all_features_scaled)
256
 
 
289
  print(f"Number of components selected by Kaiser Criterion: {kaiser_criterion}")
290
 
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  # %%
293
 
294
  # %%