In [1]:
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Audio
from pydub import AudioSegment
import warnings
warnings.filterwarnings("ignore")
In [2]:
audio = AudioSegment.from_file("Cold.m4a", format="m4a")
audio.export("Cold.wav", format="wav")
Audio('Cold.wav')
Out[2]:
In [3]:
audio = AudioSegment.from_file("Hot.m4a", format="m4a")
audio.export("Hot.wav", format="wav")
Audio('Hot.wav')
Out[3]:
MFCC the layer 1,4 and 7¶
In [4]:
# Before 5.5 seconds is cold, between 6 and 12.5 is hot, after 12.5 is cold.
filename = 'Coldhotcold.m4a'
y, sr = librosa.load(filename)
duration = librosa.get_duration(y=y, sr=sr)
# Extract 8 MFCC coefficients
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=8)
window_size = 5 # Adjust the window size for smoothing
smoothed_mfccs = np.apply_along_axis(
lambda x: np.convolve(x, np.ones(window_size) / window_size, mode='same'), axis=1, arr=mfccs
)
plt.figure(figsize=(20, 4))
plt.imshow(smoothed_mfccs, aspect='auto', origin='lower', extent=[0, duration, 0, mfccs.shape[0]], cmap='tab20')
plt.colorbar(label="MFCC Coefficient Value")
plt.xlabel("Time (seconds)")
plt.ylabel("MFCC Coefficient Index")
duration = librosa.get_duration(y=y, sr=sr)
plt.xticks(np.arange(0, duration, step=1))
plt.vlines(6, 0, 8, color='red')
plt.vlines(12.5, 0, 8, color='red')
plt.hlines(4, 0, duration, color='red')
plt.hlines(3, 0, duration, color='red')
plt.title("MFCCs Over Time")
plt.show()
spectral centroid of the sound¶
In [5]:
# Load an audio signal
filename = 'Coldhotcold.m4a'
y, sr = librosa.load(filename)
# Compute the spectral centroid
spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
# Smooth the spectral centroid using a moving average
window_size = 50 # Size of the sliding window
smoothed_centroid = np.convolve(spectral_centroids, np.ones(window_size) / window_size, mode='valid')
# Convert frame indices to time
frames = range(len(spectral_centroids))
time = librosa.frames_to_time(frames, sr=sr)
# Trim time to match the smoothed centroid length
time_smooth = time[:len(smoothed_centroid)]
# Plot the original and smoothed spectral centroid
plt.figure(figsize=(10, 6))
plt.plot(time, spectral_centroids, label='Original Spectral Centroid', alpha=0.6)
plt.plot(time_smooth, smoothed_centroid, label='Smoothed Spectral Centroid', color='red')
plt.vlines(6, 0, 7000, color='red')
plt.vlines(12.5, 0, 7000, color='red')
plt.title('Spectral Centroid Smoothing using Moving Average')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.legend()
plt.show()
Spectogram of the sound¶
In [6]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Load audio file
filename = 'Coldhotcold.m4a'
y, sr = librosa.load(filename)
# Compute the Short-Time Fourier Transform (STFT)
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
# Plot the spectrogram
plt.figure(figsize=(10, 6))
librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='linear', cmap='tab20')
plt.vlines(6, 0, 10000, color='green')
plt.vlines(12.5, 0, 10000, color='green')
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram (Linear Frequency)')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.show()
Plot Histogram of Hot and Cold¶
In [7]:
from typing import List
import librosa
filenames = ['Cold.m4a', 'Hot.m4a']
file_numbers = len(filenames)
mfcc_layers = 8
mfcc_list = [[0 for _ in range(mfcc_layers)] for _ in range(file_numbers)]
durations: List[float] = []
for i, filename in enumerate(filenames):
y, sr = librosa.load(filename)
durations.append(librosa.get_duration(y=y, sr=sr))
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=mfcc_layers)
window_size = 1 # Adjust the window size for smoothing
mfccs = np.apply_along_axis(
lambda x: np.convolve(x, np.ones(window_size) / window_size, mode='same'), axis=1, arr=mfccs
)
for j in range(mfcc_layers):
mfcc_list[i][j] = mfccs[j]
#plot hist of each mfcc layer
figs, axes = plt.subplots(nrows=mfcc_layers, ncols=1, figsize=(20, 5 * mfcc_layers))
plt.subplots_adjust(hspace=0.6, wspace=0.5)
for i in range(mfcc_layers):
for j in range(file_numbers):
ax_show = axes[i].hist(mfcc_list[j][i], bins=50, alpha=0.5, color='blue' if j == 0 else 'red')
axes[i].set_xlabel("MFCC Coefficient Value")
axes[i].set_ylabel("Count")
axes[i].set_title("MFCCs Histogram for Layer " + str(i + 1))
plt.show()
Use MFCC layer 7 and layer 8 to scatter point for cold and hot¶
In [8]:
from copy import deepcopy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
mfcc_copy = deepcopy(mfcc_list)
plt.figure(figsize=(20, 20))
x_layer = 7
y_layer = 6
plt.scatter(scaler.fit_transform(pd.DataFrame(mfcc_list[1][x_layer])),
scaler.fit_transform(pd.DataFrame(mfcc_list[1][y_layer])), color='red', label='Hot', alpha=0.5)
plt.scatter(scaler.fit_transform(pd.DataFrame(mfcc_list[0][x_layer])),
scaler.fit_transform(pd.DataFrame(mfcc_list[0][y_layer])), color='blue', label='Cold', alpha=0.5)
plt.show()
In [9]:
import matplotlib.image as mpimg
plt.figure(figsize=(10, 10))
image = mpimg.imread('3Dshow.png') # Replace with your image path
plt.imshow(image)
plt.axis('off') # Remove axes
plt.gca().set_axis_off() # Further ensure no borders
plt.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove white padding
plt.show()
In [10]:
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Function to extract MFCCs from an audio file
def extract_mfcc_features(filename, target: int, n_mfcc=8):
y, sr = librosa.load(filename)
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
df = pd.DataFrame(mfccs.T)
df['target'] = target
return df
# Load your data (list of audio filenames and their labels)
audio_files = ['Cold.m4a', 'Hot.m4a'] # Replace with your files
labels = [0, 1] # Corresponding targets for each file
# Extract features and prepare the dataset
df_cold = extract_mfcc_features("Cold.m4a", target=0)
df_hot = extract_mfcc_features("Hot.m4a", target=1)
features = pd.concat([df_cold, df_hot], ignore_index=True)
targets = features.pop('target')
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42, stratify=targets,
shuffle=True)
# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
indices = np.argsort(model.feature_importances_)
sorted_features = features.columns[indices]
print(f"mfcc layer importance is: {sorted_features}")
Accuracy: 0.9947780678851175 Classification Report: precision recall f1-score support 0 1.00 0.99 0.99 168 1 0.99 1.00 1.00 215 accuracy 0.99 383 macro avg 1.00 0.99 0.99 383 weighted avg 0.99 0.99 0.99 383 mfcc layer importance is: Index([2, 4, 1, 7, 0, 3, 6, 5], dtype='object')
Test with the sound¶
In [11]:
filename = 'Coldhotcold.m4a'
n_mfcc = 8
y, sr = librosa.load(filename)
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
df = pd.DataFrame(mfccs.T)
y_pred = model.predict(df)
In [12]:
plt.figure(figsize=(20, 6))
plt.scatter(np.arange(len(y_pred))*0.023, y_pred)
plt.show()