Spaces:

wwebec
/

empty-string

Sleeping

App Files Files Community

Jan Winkler commited on May 20

Commit

719aa88

•

1 Parent(s): 2a66be9

delete redundant code, update paths and restructuring (#6)

Browse files

Files changed (2) hide show

README.md +1 -1
python/notebooks/eda_jan.py +59 -171

README.md CHANGED Viewed

@@ -64,6 +64,6 @@ now you shoold see the running docker containers.
   1.  `chmod +x format` so that the `format` file is executable
   2.  then simply use `./format` before adding your changes and all the files will be autoformatted
-  ## awesome data overview
   ![](./docs/data_overview_yuri.jpg)

   1.  `chmod +x format` so that the `format` file is executable
   2.  then simply use `./format` before adding your changes and all the files will be autoformatted
+## awesome data overview
   ![](./docs/data_overview_yuri.jpg)

python/notebooks/eda_jan.py CHANGED Viewed

@@ -12,114 +12,6 @@
 #     name: python3
 # ---
-# %%
-import os
-import numpy as np
-import librosa
-import librosa.display
-import matplotlib.pyplot as plt
-from sklearn.cluster import KMeans
-from sklearn.decomposition import PCA
-from IPython.display import Audio, display
-# %%
-# Load the entire audio file
-cwd = os.getcwd()
-relative_path = "data/soundscape_data/PER_001_S01_20190116_100007Z.flac"
-file_path = os.path.join(cwd, relative_path)
-y, sr = librosa.load(file_path, sr=44100)
-# %%
-# split soundfile in to 10s chunks
-window_size = 10  # window size in seconds
-hop_size = 10  # hop size in seconds
-# Convert window and hop size to samples
-window_samples = int(window_size * sr)
-hop_samples = int(hop_size * sr)
-# Total number of windows
-num_windows = (len(y) - window_samples) // hop_samples + 1
-print(f"Total number of windows: {num_windows}")
-# %%
-# Define frequency bands (in Hz)
-bands = {
-    "Sub-bass": (20, 60),
-    "Bass": (60, 250),
-    "Low Midrange": (250, 500),
-    "Midrange": (500, 2000),
-    "Upper Midrange": (2000, 4000),
-    "Presence": (4000, 6000),
-    "Brilliance": (6000, 20000),
-}
-# Initialize a list to hold the features
-all_features = []
-for i in range(num_windows):
-    start_sample = i * hop_samples
-    end_sample = start_sample + window_samples
-    y_window = y[start_sample:end_sample]
-    # Compute STFT
-    S = librosa.stft(y_window)
-    S_db = librosa.amplitude_to_db(np.abs(S))
-    # Compute features for each band
-    features = []
-    for band, (low_freq, high_freq) in bands.items():
-        low_bin = int(np.floor(low_freq * (S.shape[0] / sr)))
-        high_bin = int(np.ceil(high_freq * (S.shape[0] / sr)))
-        band_energy = np.mean(S_db[low_bin:high_bin, :], axis=0)
-        features.append(band_energy)
-    # Flatten the feature array and add to all_features
-    features_flat = np.concatenate(features)
-    all_features.append(features_flat)
-# Convert to numpy array
-all_features = np.array(all_features)
-# %%
-# Reduce dimensionality with PCA
-pca = PCA(n_components=2)
-features_reduced = pca.fit_transform(all_features)
-# Perform k-means clustering
-kmeans = KMeans(n_clusters=5)  # Example: 5 clusters
-clusters = kmeans.fit_predict(features_reduced)
-# Plot the clusters
-plt.figure(figsize=(10, 6))
-scatter = plt.scatter(
-    features_reduced[:, 0], features_reduced[:, 1], c=clusters, cmap="viridis"
-)
-plt.title("Clustered Frequency Band Features")
-plt.xlabel("Principal Component 1")
-plt.ylabel("Principal Component 2")
-plt.colorbar(scatter, label="Cluster")
-plt.show()
-# %%
-# Play the audio for a representative sample from each cluster
-for cluster_label in np.unique(clusters):
-    # Find the first data point in the cluster
-    representative_index = np.where(clusters == cluster_label)[0][0]
-    # Use the original audio window at the representative index
-    start_sample = representative_index * hop_samples
-    end_sample = start_sample + window_samples
-    y_representative = y[start_sample:end_sample]
-    print(f"Cluster {cluster_label} representative audio:")
-    display(Audio(data=y_representative, rate=sr))
 # %% [markdown]
 # ## pipeline for all the files
@@ -140,12 +32,22 @@ from sklearn.ensemble import RandomForestClassifier
 # %%
 # Directory containing the audio files
 # audio_dir = "data/soundscape_data"
 audio_dir = (
-    "data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
 )
 # Parameters for windowing
 window_size = 10  # window size in seconds
 hop_size = 10  # hop size in seconds
@@ -161,15 +63,11 @@ bands = {
     "Brilliance": (6000, 20000),
 }
-# Directory to save features
-features_dir = "features"
-os.makedirs(features_dir, exist_ok=True)
 # Iterate over each audio file in the directory
 for filename in os.listdir(audio_dir):
     if filename.endswith(".wav"):
         file_path = os.path.join(audio_dir, filename)
-        y, sr = librosa.load(file_path, sr=44100)
         # Convert window and hop size to samples
         window_samples = int(window_size * sr)
@@ -216,11 +114,9 @@ for filename in os.listdir(audio_dir):
 # %%
-# Directory to load features
-features_dir = "features"
 n_clusters = 5
 # Load all features
 all_features = []
 for feature_file in os.listdir(features_dir):
@@ -239,21 +135,36 @@ features_pca = pca.fit_transform(all_features)
 kmeans = KMeans(n_clusters=n_clusters)  # Example: 5 clusters
 clusters = kmeans.fit_predict(all_features)
-# Plot the PCA-reduced features with cluster labels
 plt.figure(figsize=(10, 6))
-scatter = plt.scatter(
-    features_pca[:, 0], features_pca[:, 1], c=clusters, cmap="viridis"
-)
 plt.title("PCA of Clustered Frequency Band Features")
 plt.xlabel("Principal Component 1")
 plt.ylabel("Principal Component 2")
-plt.colorbar(scatter, label="Cluster")
 plt.show()
-# Save clustering results
-clustering_results = {"clusters": clusters, "kmeans": kmeans, "pca": pca}
-joblib.dump(clustering_results, "clustering_results.pkl")
 # Plot the clusters
 plt.figure(figsize=(10, 6))
 for i in range(n_clusters):
@@ -264,21 +175,26 @@ plt.xlabel("Feature Index (Frequency Bands)")
 plt.ylabel("Mean Feature Value (Energy in dB)")
 plt.show()
 # %%
-# Directory containing the audio files
-# audio_dir = "data/soundscape_data"
-audio_dir = (
-    "data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
-)
-# Directory to load features
-features_dir = "features"
 # Parameters for windowing
 window_size = 10  # window size in seconds
 hop_size = 10  # hop size in seconds
 # Load clustering results
-clustering_results = joblib.load("clustering_results.pkl")
 clusters = clustering_results["clusters"]
 # Load all features
@@ -290,7 +206,7 @@ for feature_file in os.listdir(features_dir):
         features, scaler = joblib.load(os.path.join(features_dir, feature_file))
         filename = feature_file.replace("_features.npy", ".wav")
         file_path = os.path.join(audio_dir, filename)
-        y, sr = librosa.load(file_path, sr=44100)
         # Convert window and hop size to samples
         window_samples = int(window_size * sr)
@@ -307,7 +223,8 @@ for feature_file in os.listdir(features_dir):
 # Flatten the list of all features
 all_features = np.vstack(all_features)
-# Play the audio for a representative sample from each cluster
 for cluster_label in np.unique(clusters):
     try:
         # Find the first data point in the cluster
@@ -323,12 +240,17 @@ for cluster_label in np.unique(clusters):
         print(f"Cluster {cluster_label} representative audio:")
         display(Audio(data=y_representative, rate=sr))
     except Exception as e:
         print(f"Could not play audio for cluster {cluster_label}: {e}")
 # %%
 # Fit PCA
 pca = PCA().fit(all_features_scaled)
@@ -367,40 +289,6 @@ kaiser_criterion = np.sum(eigenvalues > 1)
 print(f"Number of components selected by Kaiser Criterion: {kaiser_criterion}")
-# %%
-# Method 4: Cross-Validation
-# Evaluate a classifier with different numbers of principal components
-## do not run if you dont have time, this takes forever.
-# scores = []
-# for n_components in range(1, len(explained_variance) + 1):
-#     pca = PCA(n_components=n_components)
-#     features_pca = pca.fit_transform(all_features_scaled)
-#     classifier = RandomForestClassifier()  # Use your preferred model here
-#     score = np.mean(cross_val_score(classifier, features_pca, clusters, cv=n_clusters))  # Assuming `clusters` are your labels
-#     scores.append(score)
-# # Plot cross-validation scores
-# plt.figure(figsize=(10, 6))
-# plt.plot(range(1, len(explained_variance) + 1), scores, marker='o')
-# plt.xlabel('Number of Principal Components')
-# plt.ylabel('Cross-Validation Score')
-# plt.title('Cross-Validation Score vs. Number of Principal Components')
-# plt.grid(True)
-# plt.show()
-# # Choosing the number of components that explain at least 95% of the variance
-# n_components_variance = np.argmax(cumulative_explained_variance >= 0.95) + 1
-# print(f"Number of components to retain 95% variance: {n_components_variance}")
-# # Choose the optimal number of components based on your analysis
-# optimal_n_components = n_components_variance  # or based on the scree plot, cross-validation, etc.
-# print(f"Optimal number of components: {optimal_n_components}")
-# # Perform PCA with the selected number of components
-# pca = PCA(n_components=optimal_n_components)
-# features_pca = pca.fit_transform(all_features_scaled)
 # %%
 # %%

 #     name: python3
 # ---
 # %% [markdown]
 # ## pipeline for all the files
 # %%
+## cockpit for directories
 # Directory containing the audio files
 # audio_dir = "data/soundscape_data"
 audio_dir = (
+    "../data/SoundMeters_Ingles_Primary-20240519T132658Z-002/SoundMeters_Ingles_Primary"
 )
+# Directory to save features
+features_dir = "../data/features"
+os.makedirs(features_dir, exist_ok=True)
+# Directory to save clusters information
+clusters_dir = "../data/clusters"
+os.makedirs(clusters_dir, exist_ok=True)
+# %%
 # Parameters for windowing
 window_size = 10  # window size in seconds
 hop_size = 10  # hop size in seconds
     "Brilliance": (6000, 20000),
 }
 # Iterate over each audio file in the directory
 for filename in os.listdir(audio_dir):
     if filename.endswith(".wav"):
         file_path = os.path.join(audio_dir, filename)
+        y, sr = librosa.load(file_path, sr=None)
         # Convert window and hop size to samples
         window_samples = int(window_size * sr)
 # %%
+# Number of clusters
 n_clusters = 5
 # Load all features
 all_features = []
 for feature_file in os.listdir(features_dir):
 kmeans = KMeans(n_clusters=n_clusters)  # Example: 5 clusters
 clusters = kmeans.fit_predict(all_features)
+# Save clustering results
+clustering_results = {"clusters": clusters, "kmeans": kmeans, "pca": pca}
+joblib.dump(clustering_results, os.path.join(clusters_dir, "clustering_results.pkl"))
+# %%
+# Plot the PCA-reduced features with cluster labels using a legend
 plt.figure(figsize=(10, 6))
+# Define a colormap
+colors = plt.cm.tab10(np.arange(kmeans.n_clusters))
+for cluster_label in np.unique(clusters):
+    cluster_points = features_pca[clusters == cluster_label]
+    plt.scatter(
+        cluster_points[:, 0],
+        cluster_points[:, 1],
+        s=50,
+        color=colors[cluster_label],
+        label=f"Cluster {cluster_label}",
+    )
 plt.title("PCA of Clustered Frequency Band Features")
 plt.xlabel("Principal Component 1")
 plt.ylabel("Principal Component 2")
+plt.legend()
 plt.show()
+# %%
 # Plot the clusters
 plt.figure(figsize=(10, 6))
 for i in range(n_clusters):
 plt.ylabel("Mean Feature Value (Energy in dB)")
 plt.show()
 # %%
+# Function to plot the spectrogram
+def plot_spectrogram(y, sr, title):
+    S = librosa.stft(y)
+    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
+    plt.figure(figsize=(10, 6))
+    librosa.display.specshow(S_db, sr=sr, x_axis="time", y_axis="log")
+    plt.colorbar(format="%+2.0f dB")
+    plt.title(title)
+    plt.show()
+# %%
 # Parameters for windowing
 window_size = 10  # window size in seconds
 hop_size = 10  # hop size in seconds
 # Load clustering results
+clustering_results = joblib.load(os.path.join(clusters_dir, "clustering_results.pkl"))
 clusters = clustering_results["clusters"]
 # Load all features
         features, scaler = joblib.load(os.path.join(features_dir, feature_file))
         filename = feature_file.replace("_features.npy", ".wav")
         file_path = os.path.join(audio_dir, filename)
+        y, sr = librosa.load(file_path, sr=None)
         # Convert window and hop size to samples
         window_samples = int(window_size * sr)
 # Flatten the list of all features
 all_features = np.vstack(all_features)
+# %%
 for cluster_label in np.unique(clusters):
     try:
         # Find the first data point in the cluster
         print(f"Cluster {cluster_label} representative audio:")
         display(Audio(data=y_representative, rate=sr))
+        # Plot the spectrogram
+        plot_spectrogram(
+            y_representative, sr, f"Spectrogram for Cluster {cluster_label}"
+        )
     except Exception as e:
         print(f"Could not play audio for cluster {cluster_label}: {e}")
 # %%
+scaler = StandardScaler()
+all_features_scaled = scaler.fit_transform(all_features)
 # Fit PCA
 pca = PCA().fit(all_features_scaled)
 print(f"Number of components selected by Kaiser Criterion: {kaiser_criterion}")
 # %%
 # %%