From a7006350705f11090e7a4790da810122e2f779fe Mon Sep 17 00:00:00 2001 From: chronos_secgrp_pytorch_oss_ci_oncall Date: Tue, 27 Jun 2023 04:30:58 -0700 Subject: [PATCH] 2023-06-27 nightly release (105b77fe346e7a1267e8319073a9353a1b45f395) --- .../audio_feature_extractions_tutorial.py | 182 +++++++++++++----- 1 file changed, 131 insertions(+), 51 deletions(-) diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py index c8c5688aee..a90c6d6c61 100644 --- a/examples/tutorials/audio_feature_extractions_tutorial.py +++ b/examples/tutorials/audio_feature_extractions_tutorial.py @@ -41,6 +41,7 @@ # !pip install librosa # from IPython.display import Audio +from matplotlib.patches import Rectangle from torchaudio.utils import download_asset torch.random.manual_seed(0) @@ -48,26 +49,28 @@ SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -def plot_waveform(waveform, sr, title="Waveform"): +def plot_waveform(waveform, sr, title="Waveform", ax=None): waveform = waveform.numpy() num_channels, num_frames = waveform.shape time_axis = torch.arange(0, num_frames) / sr - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) + if ax is None: + _, ax = plt.subplots(num_channels, 1) + ax.plot(time_axis, waveform[0], linewidth=1) + ax.grid(True) + ax.set_xlim([0, time_axis[-1]]) + ax.set_title(title) plt.show(block=False) -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) +def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None): + if ax is None: + _, ax = plt.subplots(1, 1) + if title is not None: + ax.set_title(title) + 
ax.set_ylabel(ylabel) + ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest") plt.show(block=False) @@ -102,77 +105,155 @@ def plot_fbank(fbank, title=None): # you can use :py:func:`torchaudio.transforms.Spectrogram`. # +# Load audio SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) +# Define transform +spectrogram = T.Spectrogram(n_fft=512) +# Perform transform +spec = spectrogram(SPEECH_WAVEFORM) ###################################################################### # -n_fft = 1024 -win_length = None -hop_length = 512 +fig, axs = plt.subplots(2, 1) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0]) +plot_spectrogram(spec[0], title="spectrogram", ax=axs[1]) +fig.tight_layout() -# Define transform -spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, -) +###################################################################### +# + +Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) ###################################################################### +# The effect of ``n_fft`` parameter +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The core of spectrogram computation is (short-term) Fourier transform, +# and the ``n_fft`` parameter corresponds to the :math:`N` in the following +# definition of discrete Fourier transform. +# +# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$ +# +# (For the detail of Fourier transform, please refer to +# `Wikipedia `__.) +# +# The value of ``n_fft`` determines the resolution of the frequency axis. +# However, with the higher ``n_fft`` value, the energy will be distributed +# among more bins, so when you visualize it, it might look more blurry, +# even though they are higher resolution. 
+# +# The following illustrates this; # -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) +###################################################################### +# +# .. note:: +# +# ``hop_length`` determines the time axis resolution. +# By default, (i.e. ``hop_length=None`` and ``win_length=None``), +# the value of ``n_fft // 4`` is used. +# Here we use the same ``hop_length`` value across different ``n_fft`` +# so that the visualizations are comparable. + +n_ffts = [32, 128, 512, 2048] +hop_length = 64 + +specs = [] +for n_fft in n_ffts: + spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length) + spec = spectrogram(SPEECH_WAVEFORM) + specs.append(spec) ###################################################################### # -plot_spectrogram(spec[0], title="torchaudio") +fig, axs = plt.subplots(len(specs), 1, sharex=True) +for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)): + plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i]) + axs[i].set_xlabel(None) +fig.tight_layout() ###################################################################### -# GriffinLim -# ---------- # -# To recover a waveform from a spectrogram, you can use ``GriffinLim``. +# When comparing signals, it is desirable to use the same sampling rate, +# however, if you must use a different sampling rate, care must be +# taken for interpreting the meaning of ``n_fft``. +# ``n_fft`` determines the resolution of the frequency, and what +# each frequency bin represents is subject to the sampling rate. # +# As we have seen above, changing the value of ``n_fft`` does not change +# the coverage of frequency range. -torch.random.manual_seed(0) +###################################################################### +# +# Let's downsample the audio and apply spectrogram with the same ``n_fft`` +# value. 
-n_fft = 1024 -win_length = None -hop_length = 512 +# Downsample to half of the original sample rate +speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2) +# Upsample to the original sample rate +speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE) -spec = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -)(SPEECH_WAVEFORM) +###################################################################### +# + +# Apply the same spectrogram +spectrogram = T.Spectrogram(n_fft=512) + +spec0 = spectrogram(SPEECH_WAVEFORM) +spec2 = spectrogram(speech2) +spec3 = spectrogram(speech3) ###################################################################### # -griffin_lim = T.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -) +# Visualize it +fig, axs = plt.subplots(3, 1) +plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0]) +axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none")) +plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1]) +plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2]) +fig.tight_layout() ###################################################################### +# +# In the above visualization, the second plot ("Downsampled") might +# give the impression that the spectrogram is stretched. +# This is because the meaning of frequency bins is different from +# the original one. +# Even though they have the same number of bins, in the second plot, +# the frequency is only covered up to half of the original sampling +# rate. +# This becomes more clear if we resample the downsampled signal again +# so that it has the same sample rate as the original. ###################################################################### # GriffinLim # ---------- # # To recover a waveform from a spectrogram, you can use +# :py:class:`torchaudio.transforms.GriffinLim`. 
+# +# The same set of parameters used for spectrogram must be used. + +# Define transforms +n_fft = 1024 +spectrogram = T.Spectrogram(n_fft=n_fft) +griffin_lim = T.GriffinLim(n_fft=n_fft) + +# Apply the transforms +spec = spectrogram(SPEECH_WAVEFORM) reconstructed_waveform = griffin_lim(spec) ###################################################################### # -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") +_, axes = plt.subplots(2, 1, sharex=True, sharey=True) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0]) +plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1]) Audio(reconstructed_waveform, rate=SAMPLE_RATE) ###################################################################### @@ -254,7 +335,6 @@ def plot_fbank(fbank, title=None): pad_mode="reflect", power=2.0, norm="slaney", - onesided=True, n_mels=n_mels, mel_scale="htk", ) @@ -323,7 +403,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc[0]) +plot_spectrogram(mfcc[0], title="MFCC") ###################################################################### # Comparison against librosa @@ -351,7 +431,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc_librosa) +plot_spectrogram(mfcc_librosa, title="MFCC (librosa)") mse = torch.square(mfcc - mfcc_librosa).mean().item() print("Mean Square Difference: ", mse) @@ -377,7 +457,7 @@ def plot_fbank(fbank, title=None): ) lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) +plot_spectrogram(lfcc[0], title="LFCC") ###################################################################### # Pitch