From a7006350705f11090e7a4790da810122e2f779fe Mon Sep 17 00:00:00 2001 From: chronos_secgrp_pytorch_oss_ci_oncall Date: Tue, 27 Jun 2023 04:30:58 -0700 Subject: [PATCH] 2023-06-27 nightly release (105b77fe346e7a1267e8319073a9353a1b45f395) --- .../audio_feature_extractions_tutorial.py | 182 +++++++++++++----- 1 file changed, 131 insertions(+), 51 deletions(-) diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py index c8c5688aee..a90c6d6c61 100644 --- a/examples/tutorials/audio_feature_extractions_tutorial.py +++ b/examples/tutorials/audio_feature_extractions_tutorial.py @@ -41,6 +41,7 @@ # !pip install librosa # from IPython.display import Audio +from matplotlib.patches import Rectangle from torchaudio.utils import download_asset torch.random.manual_seed(0) @@ -48,26 +49,28 @@ SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -def plot_waveform(waveform, sr, title="Waveform"): +def plot_waveform(waveform, sr, title="Waveform", ax=None): waveform = waveform.numpy() num_channels, num_frames = waveform.shape time_axis = torch.arange(0, num_frames) / sr - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) + if ax is None: + _, ax = plt.subplots(num_channels, 1) + ax.plot(time_axis, waveform[0], linewidth=1) + ax.grid(True) + ax.set_xlim([0, time_axis[-1]]) + ax.set_title(title) plt.show(block=False) -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) +def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None): + if ax is None: + _, ax = plt.subplots(1, 1) + if title is not None: + ax.set_title(title) + 
ax.set_ylabel(ylabel) + ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest") plt.show(block=False) @@ -102,77 +105,155 @@ def plot_fbank(fbank, title=None): # you can use :py:func:`torchaudio.transforms.Spectrogram`. # +# Load audio SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) +# Define transform +spectrogram = T.Spectrogram(n_fft=512) +# Perform transform +spec = spectrogram(SPEECH_WAVEFORM) ###################################################################### # -n_fft = 1024 -win_length = None -hop_length = 512 +fig, axs = plt.subplots(2, 1) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0]) +plot_spectrogram(spec[0], title="spectrogram", ax=axs[1]) +fig.tight_layout() -# Define transform -spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, -) +###################################################################### +# + +Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) ###################################################################### +# The effect of ``n_fft`` parameter +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The core of spectrogram computation is (short-term) Fourier transform, +# and the ``n_fft`` parameter corresponds to the :math:`N` in the following +# definition of discrete Fourier transform. +# +# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$ +# +# (For the detail of Fourier transform, please refer to +# `Wikipedia `__.) +# +# The value of ``n_fft`` determines the resolution of the frequency axis. +# However, with the higher ``n_fft`` value, the energy will be distributed +# among more bins, so when you visualize it, it might look more blurry, +# even though they are higher resolution. 
+# +# The following illustrates this; # -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) +###################################################################### +# +# .. note:: +# +# ``hop_length`` determines the time axis resolution. +# By default, (i.e. ``hop_length=None`` and ``win_length=None``), +# the value of ``n_fft // 4`` is used. +# Here we use the same ``hop_length`` value across different ``n_fft`` +# so that the visualizations are comparable. + +n_ffts = [32, 128, 512, 2048] +hop_length = 64 + +specs = [] +for n_fft in n_ffts: + spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length) + spec = spectrogram(SPEECH_WAVEFORM) + specs.append(spec) ###################################################################### # -plot_spectrogram(spec[0], title="torchaudio") +fig, axs = plt.subplots(len(specs), 1, sharex=True) +for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)): + plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i]) + axs[i].set_xlabel(None) +fig.tight_layout() ###################################################################### -# GriffinLim -# ---------- # -# To recover a waveform from a spectrogram, you can use ``GriffinLim``. +# When comparing signals, it is desirable to use the same sampling rate, +# however, if you must use a different sampling rate, care must be +# taken for interpreting the meaning of ``n_fft``. +# ``n_fft`` determines the resolution of the frequency, and what +# each frequency bin represents is subject to the sampling rate. # +# As we have seen above, changing the value of ``n_fft`` does not change +# the coverage of frequency range. -torch.random.manual_seed(0) +###################################################################### +# +# Let's downsample the audio and apply spectrogram with the same ``n_fft`` +# value. 
-n_fft = 1024 -win_length = None -hop_length = 512 +# Downsample to half of the original sample rate +speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2) +# Upsample to the original sample rate +speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE) -spec = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -)(SPEECH_WAVEFORM) +###################################################################### +# + +# Apply the same spectrogram +spectrogram = T.Spectrogram(n_fft=512) + +spec0 = spectrogram(SPEECH_WAVEFORM) +spec2 = spectrogram(speech2) +spec3 = spectrogram(speech3) ###################################################################### # -griffin_lim = T.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -) +# Visualize it +fig, axs = plt.subplots(3, 1) +plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0]) +axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none")) +plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1]) +plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2]) +fig.tight_layout() ###################################################################### +# +# In the above visualization, the second plot ("Downsampled") might +# give the impression that the spectrogram is stretched. +# This is because the meaning of frequency bins is different from +# the original one. +# Even though they have the same number of bins, in the second plot, +# the frequency is only covered up to half of the original sampling +# rate. +# This becomes more clear if we resample the downsampled signal again +# so that it has the same sample rate as the original. ###################################################################### # GriffinLim # ---------- # # To recover a waveform from a spectrogram, you can use +# :py:class:`torchaudio.transforms.GriffinLim`. 
+# +# The same set of parameters used for spectrogram must be used. + +# Define transforms +n_fft = 1024 +spectrogram = T.Spectrogram(n_fft=n_fft) +griffin_lim = T.GriffinLim(n_fft=n_fft) + +# Apply the transforms +spec = spectrogram(SPEECH_WAVEFORM) reconstructed_waveform = griffin_lim(spec) ###################################################################### # -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") +_, axes = plt.subplots(2, 1, sharex=True, sharey=True) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0]) +plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1]) Audio(reconstructed_waveform, rate=SAMPLE_RATE) ###################################################################### @@ -254,7 +335,6 @@ def plot_fbank(fbank, title=None): pad_mode="reflect", power=2.0, norm="slaney", - onesided=True, n_mels=n_mels, mel_scale="htk", ) @@ -323,7 +403,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc[0]) +plot_spectrogram(mfcc[0], title="MFCC") ###################################################################### # Comparison against librosa @@ -351,7 +431,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc_librosa) +plot_spectrogram(mfcc_librosa, title="MFCC (librosa)") mse = torch.square(mfcc - mfcc_librosa).mean().item() print("Mean Square Difference: ", mse) @@ -377,7 +457,7 @@ def plot_fbank(fbank, title=None): ) lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) +plot_spectrogram(lfcc[0], title="LFCC") ###################################################################### # Pitch