From a02cd4e65443a691e752ed4ac815d34590d4f233 Mon Sep 17 00:00:00 2001
From: chronos_secgrp_pytorch_oss_ci_oncall
 <chronos_secgrp_pytorch_oss_ci_oncall@twshared13551.08.pnb1.facebook.com>
Date: Thu, 6 Jul 2023 04:30:47 -0700
Subject: [PATCH] 2023-07-06 nightly release
 (ca66a1d3a0a000031ee0927c17c5b223dc119077)

---
 .github/workflows/ffmpeg.yml                  |  82 +++++
 CMakeLists.txt                                |   2 +-
 .../ctc_forced_alignment_api_tutorial.py      |  38 +--
 ...lignment_for_multilingual_data_tutorial.py |  41 +--
 packaging/ffmpeg/build.sh                     |  32 +-
 .../functional/functional_impl.py             |  85 ++---
 third_party/CMakeLists.txt                    |  11 -
 tools/setup_helpers/extension.py              |   2 -
 torchaudio/csrc/ffmpeg/CMakeLists.txt         |  18 +-
 torchaudio/csrc/ffmpeg/ffmpeg.cpp             |  41 +--
 torchaudio/csrc/ffmpeg/ffmpeg.h               |   5 +-
 torchaudio/csrc/ffmpeg/filter_graph.cpp       |  40 ++-
 torchaudio/csrc/ffmpeg/hw_context.cpp         |   3 +-
 torchaudio/csrc/ffmpeg/pybind/pybind.cpp      |  50 ++-
 .../csrc/ffmpeg/stream_reader/conversion.cpp  |  13 +-
 .../ffmpeg/stream_reader/packet_buffer.cpp    |   4 +-
 .../ffmpeg/stream_reader/post_process.cpp     |  37 +--
 .../ffmpeg/stream_reader/stream_processor.cpp |  43 ++-
 .../ffmpeg/stream_reader/stream_reader.cpp    |  47 ++-
 .../ffmpeg/stream_writer/encode_process.cpp   |  94 +++---
 .../csrc/ffmpeg/stream_writer/encoder.cpp     |  12 +-
 .../ffmpeg/stream_writer/packet_writer.cpp    |  14 +-
 .../ffmpeg/stream_writer/stream_writer.cpp    |  32 +-
 .../ffmpeg/stream_writer/tensor_converter.cpp |  28 +-
 torchaudio/csrc/ffmpeg/stub.cpp               | 196 -----------
 torchaudio/csrc/ffmpeg/stub.h                 | 313 ------------------
 torchaudio/csrc/forced_align/cpu/compute.cpp  |  71 ++--
 torchaudio/csrc/forced_align/gpu/compute.cu   |  83 +++--
 torchaudio/functional/functional.py           |  11 +-
 29 files changed, 510 insertions(+), 938 deletions(-)
 create mode 100644 .github/workflows/ffmpeg.yml
 delete mode 100644 third_party/CMakeLists.txt
 delete mode 100644 torchaudio/csrc/ffmpeg/stub.cpp
 delete mode 100644 torchaudio/csrc/ffmpeg/stub.h

diff --git a/.github/workflows/ffmpeg.yml b/.github/workflows/ffmpeg.yml
new file mode 100644
index 0000000000..cc39217e65
--- /dev/null
+++ b/.github/workflows/ffmpeg.yml
@@ -0,0 +1,82 @@
+# This job is not directly related to the regular CI pipeline.
+# It is intended to create FFmpeg binaries that we upload to S3,
+# which will then be used by the build process, both in CI and locally.
+#
+# This job does not include the upload step.
+# The upload needs to be done manually, and it should be done only once
+# per new major release of FFmpeg.
+name: FFmpeg Binaries
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * 0'  # on sunday
+
+jobs:
+  Linux-LGPL:
+    strategy:
+      fail-fast: false
+      matrix:
+        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      job-name: Build LGPL FFmpeg for Linux
+      upload-artifact: ffmpeg-linux-lgpl
+      repository: pytorch/audio
+      script: |
+        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
+        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
+        ./packaging/ffmpeg/build.sh
+
+        cd "${FFMPEG_ROOT}/.."
+        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib  # -z: actually gzip, to match the .tar.gz name
+
+        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux"
+        mkdir -p "${artifact_dir}"
+        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
+
+  macOS-LGPL:
+    strategy:
+      fail-fast: false
+      matrix:
+        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
+        runner: ["macos-m1-12", "macos-12"]
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      job-name: Build LGPL FFmpeg for macOS ("${{ matrix.runner }}")
+      upload-artifact: ffmpeg-macos-lgpl
+      repository: pytorch/audio
+      runner: "${{ matrix.runner }}"
+      script: |
+        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
+        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
+        ./packaging/ffmpeg/build.sh
+
+        cd "${FFMPEG_ROOT}/.."
+        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib  # -z: actually gzip, to match the .tar.gz name
+
+        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/macos_$(uname -m)"
+        mkdir -p "${artifact_dir}"
+        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
+
+  Windows-LGPL:
+    strategy:
+      fail-fast: false
+      matrix:
+        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
+    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
+    with:
+      job-name: Build LGPL FFmpeg for Windows
+      upload-artifact: ffmpeg-windows-lgpl
+      repository: pytorch/audio
+      script: |
+        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
+        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
+        ./packaging/ffmpeg/build.bat
+
+        cd "${FFMPEG_ROOT}/.."
+        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/bin  # -z: actually gzip, to match the .tar.gz name
+
+        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/windows"
+        mkdir -p "${artifact_dir}"
+        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e45a134ed..7649c92050 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -165,9 +165,9 @@ else()
   message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
 endif()
 
-add_subdirectory(third_party)
 add_subdirectory(torchaudio/csrc)
 if (BUILD_SOX)
+  add_subdirectory(third_party/sox)
   add_subdirectory(torchaudio/csrc/sox)
 endif()
 if (USE_FFMPEG)
diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
index be8fc27d7a..a0d3d7acb7 100644
--- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py
+++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py
@@ -96,7 +96,7 @@
     emissions, _ = model(waveform.to(device))
     emissions = torch.log_softmax(emissions, dim=-1)
 
-emission = emissions[0].cpu().detach()
+emission = emissions.cpu().detach()
 dictionary = {c: i for i, c in enumerate(labels)}
 
 print(dictionary)
@@ -107,7 +107,7 @@
 # ^^^^^^^^^^^^^
 #
 
-plt.imshow(emission.T)
+plt.imshow(emission[0].T)
 plt.colorbar()
 plt.title("Frame-wise class probabilities")
 plt.xlabel("Time")
@@ -205,27 +205,27 @@ def compute_alignments(transcript, dictionary, emission):
     frames = []
     tokens = [dictionary[c] for c in transcript.replace(" ", "")]
 
-    targets = torch.tensor(tokens, dtype=torch.int32)
-    input_lengths = torch.tensor(emission.shape[0])
-    target_lengths = torch.tensor(targets.shape[0])
+    targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
+    input_lengths = torch.tensor([emission.shape[1]])
+    target_lengths = torch.tensor([targets.shape[1]])
 
     # This is the key step, where we call the forced alignment API functional.forced_align to compute alignments.
     frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)
 
-    assert len(frame_alignment) == input_lengths.item()
-    assert len(targets) == target_lengths.item()
+    assert frame_alignment.shape[1] == input_lengths[0].item()
+    assert targets.shape[1] == target_lengths[0].item()
 
     token_index = -1
     prev_hyp = 0
-    for i in range(len(frame_alignment)):
-        if frame_alignment[i].item() == 0:
+    for i in range(frame_alignment.shape[1]):
+        if frame_alignment[0][i].item() == 0:
             prev_hyp = 0
             continue
 
-        if frame_alignment[i].item() != prev_hyp:
+        if frame_alignment[0][i].item() != prev_hyp:
             token_index += 1
-        frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
-        prev_hyp = frame_alignment[i].item()
+        frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
+        prev_hyp = frame_alignment[0][i].item()
     return frames, frame_alignment, frame_scores
 
 
@@ -390,7 +390,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
     plt.rcParams.update({"font.size": 30})
 
     # The original waveform
-    ratio = waveform.size(0) / input_lengths
+    ratio = waveform.size(1) / input_lengths
     ax2.plot(waveform)
     ax2.set_ylim(-1.0 * scale, 1.0 * scale)
     ax2.set_xlim(0, waveform.size(-1))
@@ -414,8 +414,8 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
 plot_alignments(
     segments,
     word_segments,
-    waveform[0],
-    emission.shape[0],
+    waveform,
+    emission.shape[1],
     1,
 )
 plt.show()
@@ -428,7 +428,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
 # `IPython.display.Audio` has to be the last call in a cell,
 # and there should be only one call par cell.
 def display_segment(i, waveform, word_segments, frame_alignment):
-    ratio = waveform.size(1) / len(frame_alignment)
+    ratio = waveform.size(1) / frame_alignment.size(1)
     word = word_segments[i]
     x0 = int(ratio * word.start)
     x1 = int(ratio * word.end)
@@ -511,19 +511,19 @@ def display_segment(i, waveform, word_segments, frame_alignment):
     # Append the extra dimension corresponding to the <star> token
     extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
     emissions = torch.cat((emissions.cpu(), extra_dim), 2)
-    emission = emissions[0].detach()
+    emission = emissions.detach()
 
 # Extend the dictionary to include the <star> token.
 dictionary["*"] = 29
 
-assert len(dictionary) == emission.shape[1]
+assert len(dictionary) == emission.shape[2]
 
 
 def compute_and_plot_alignments(transcript, dictionary, emission, waveform):
     frames, frame_alignment, _ = compute_alignments(transcript, dictionary, emission)
     segments = merge_repeats(frames, transcript)
     word_segments = merge_words(transcript, segments, "|")
-    plot_alignments(segments, word_segments, waveform[0], emission.shape[0], 1)
+    plot_alignments(segments, word_segments, waveform, emission.shape[1], 1)
     plt.show()
     return word_segments, frame_alignment
 
diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
index 95251c6198..01333d7175 100644
--- a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
+++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py
@@ -90,27 +90,27 @@ def compute_alignments(transcript, dictionary, emission):
     frames = []
     tokens = [dictionary[c] for c in transcript.replace(" ", "")]
 
-    targets = torch.tensor(tokens, dtype=torch.int32)
-    input_lengths = torch.tensor(emission.shape[0])
-    target_lengths = torch.tensor(targets.shape[0])
+    targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
+    input_lengths = torch.tensor([emission.shape[1]])
+    target_lengths = torch.tensor([targets.shape[1]])
 
     # This is the key step, where we call the forced alignment API functional.forced_align to compute frame alignments.
     frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)
 
-    assert len(frame_alignment) == input_lengths.item()
-    assert len(targets) == target_lengths.item()
+    assert frame_alignment.shape[1] == input_lengths[0].item()
+    assert targets.shape[1] == target_lengths[0].item()
 
     token_index = -1
     prev_hyp = 0
-    for i in range(len(frame_alignment)):
-        if frame_alignment[i].item() == 0:
+    for i in range(frame_alignment.shape[1]):
+        if frame_alignment[0][i].item() == 0:
             prev_hyp = 0
             continue
 
-        if frame_alignment[i].item() != prev_hyp:
+        if frame_alignment[0][i].item() != prev_hyp:
             token_index += 1
-        frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
-        prev_hyp = frame_alignment[i].item()
+        frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
+        prev_hyp = frame_alignment[0][i].item()
 
     # compute frame alignments from token alignments
     transcript_nospace = transcript.replace(" ", "")
@@ -150,7 +150,7 @@ def compute_alignments(transcript, dictionary, emission):
             i2 += 1
         i3 += 1
 
-    num_frames = len(frame_alignment)
+    num_frames = frame_alignment.shape[1]
     return segments, words, num_frames
 
 
@@ -160,7 +160,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
     plt.rcParams.update({"font.size": 30})
 
     # The original waveform
-    ratio = waveform.size(0) / input_lengths
+    ratio = waveform.size(1) / input_lengths
     ax2.plot(waveform)
     ax2.set_ylim(-1.0 * scale, 1.0 * scale)
     ax2.set_xlim(0, waveform.size(-1))
@@ -249,12 +249,12 @@ def get_emission(waveform):
 
     emissions, _ = model(waveform)
     emissions = torch.log_softmax(emissions, dim=-1)
-    emission = emissions[0].cpu().detach()
+    emission = emissions.cpu().detach()
 
     # Append the extra dimension corresponding to the <star> token
     extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
     emissions = torch.cat((emissions.cpu(), extra_dim), 2)
-    emission = emissions[0].detach()
+    emission = emissions.detach()
     return emission, waveform
 
 
@@ -347,12 +347,12 @@ def get_emission(waveform):
 waveform, _ = torchaudio.load(speech_file)
 
 emission, waveform = get_emission(waveform)
-assert len(dictionary) == emission.shape[1]
+assert len(dictionary) == emission.shape[2]
 
 transcript = text_normalized
 
 segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
+plot_alignments(segments, word_segments, waveform, emission.shape[1])
 
 print("Raw Transcript: ", text_raw)
 print("Normalized Transcript: ", text_normalized)
@@ -482,13 +482,14 @@ def get_emission(waveform):
 text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"
 speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False)
 waveform, _ = torchaudio.load(speech_file)
+waveform = waveform[0:1]
 
 emission, waveform = get_emission(waveform)
 
 transcript = text_normalized
 
 segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
+plot_alignments(segments, word_segments, waveform, emission.shape[1])
 
 print("Raw Transcript: ", text_raw)
 print("Normalized Transcript: ", text_normalized)
@@ -557,7 +558,7 @@ def get_emission(waveform):
 transcript = text_normalized
 
 segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
+plot_alignments(segments, word_segments, waveform, emission.shape[1])
 
 print("Raw Transcript: ", text_raw)
 print("Normalized Transcript: ", text_normalized)
@@ -660,7 +661,7 @@ def get_emission(waveform):
 transcript = text_normalized
 
 segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
+plot_alignments(segments, word_segments, waveform, emission.shape[1])
 
 print("Raw Transcript: ", text_raw)
 print("Normalized Transcript: ", text_normalized)
@@ -785,7 +786,7 @@ def get_emission(waveform):
 transcript = text_normalized
 
 segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
+plot_alignments(segments, word_segments, waveform, emission.shape[1])
 
 print("Raw Transcript: ", text_raw)
 print("Normalized Transcript: ", text_normalized)
diff --git a/packaging/ffmpeg/build.sh b/packaging/ffmpeg/build.sh
index 4ccdb6924b..7648830336 100755
--- a/packaging/ffmpeg/build.sh
+++ b/packaging/ffmpeg/build.sh
@@ -21,6 +21,8 @@ if [[ "$OSTYPE" == "msys" ]]; then
    args="--toolchain=msvc"
 fi
 
+archive="https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n${FFMPEG_VERSION:-4.1.8}.tar.gz"
+
 build_dir=$(mktemp -d -t ffmpeg-build.XXXXXXXXXX)
 cleanup() {
     rm -rf "${build_dir}"
@@ -32,7 +34,7 @@ cd "${build_dir}"
 # NOTE:
 # When changing the version of FFmpeg, update the README so that the link to the source points
 # the same version.
-curl -LsS -o ffmpeg.tar.gz https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n4.1.8.tar.gz
+curl -LsS -o ffmpeg.tar.gz "${archive}"
 tar -xf ffmpeg.tar.gz --strip-components 1
 ./configure \
     --prefix="${prefix}" \
@@ -72,11 +74,29 @@ ls ${prefix}/*
 # macOS: Fix rpath so that the libraries are searched dynamically in user environment.
 # In Linux, this is handled by `--enable-rpath` flag.
 if [[ "$(uname)" == Darwin ]]; then
-    avcodec=libavcodec.58
-    avdevice=libavdevice.58
-    avfilter=libavfilter.7
-    avformat=libavformat.58
-    avutil=libavutil.56
+    major_ver=$(cut -d. -f1 <<< "${FFMPEG_VERSION:-4.1.8}")  # same default as the download URL
+    if [[ ${major_ver} == 4 ]]; then
+        avutil=libavutil.56
+        avcodec=libavcodec.58
+        avformat=libavformat.58
+        avdevice=libavdevice.58
+        avfilter=libavfilter.7
+    elif [[ ${major_ver} == 5 ]]; then
+        avutil=libavutil.57
+        avcodec=libavcodec.59
+        avformat=libavformat.59
+        avdevice=libavdevice.59
+        avfilter=libavfilter.8
+    elif [[ ${major_ver} == 6 ]]; then
+        avutil=libavutil.58
+        avcodec=libavcodec.60
+        avformat=libavformat.60
+        avdevice=libavdevice.60
+        avfilter=libavfilter.9
+    else
+        printf "Error: unexpected FFmpeg major version: %s\n" "${major_ver}"
+        exit 1;
+    fi
 
     otool="/usr/bin/otool"
     # NOTE: miniconda has a version of otool and install_name_tool installed and we want
diff --git a/test/torchaudio_unittest/functional/functional_impl.py b/test/torchaudio_unittest/functional/functional_impl.py
index d7847c034f..6bb6a9f8bf 100644
--- a/test/torchaudio_unittest/functional/functional_impl.py
+++ b/test/torchaudio_unittest/functional/functional_impl.py
@@ -1116,55 +1116,60 @@ def test_preemphasis_deemphasis_roundtrip(self, input_shape, coeff):
 
     @parameterized.expand(
         [
-            ([0, 1, 1, 0], [0, 1, 5, 1, 0], torch.int32),
-            ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], torch.int32),
-            ([3, 3, 3], [3, 5, 3, 5, 3], torch.int64),
-            ([0, 1, 2], [0, 1, 1, 1, 2], torch.int64),
+            ([[0, 1, 1, 0]], [[0, 1, 5, 1, 0]], torch.int32),
+            ([[0, 1, 2, 3, 4]], [[0, 1, 2, 3, 4]], torch.int32),
+            ([[3, 3, 3]], [[3, 5, 3, 5, 3]], torch.int64),
+            ([[0, 1, 2]], [[0, 1, 1, 1, 2]], torch.int64),
         ]
     )
     def test_forced_align(self, targets, ref_path, targets_dtype):
         emission = torch.tensor(
             [
-                [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553],
-                [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436],
-                [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688],
-                [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533],
-                [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107],
+                [
+                    [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553],
+                    [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436],
+                    [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688],
+                    [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533],
+                    [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107],
+                ]
             ],
             dtype=self.dtype,
             device=self.device,
         )
         blank = 5
+        batch_index = 0
         ref_path = torch.tensor(ref_path, dtype=targets_dtype, device=self.device)
         ref_scores = torch.tensor(
-            [torch.log(emission[i, ref_path[i]]).item() for i in range(emission.shape[0])],
+            [torch.log(emission[batch_index, i, ref_path[batch_index, i]]).item() for i in range(emission.shape[1])],
             dtype=emission.dtype,
             device=self.device,
-        )
+        ).unsqueeze(0)
         log_probs = torch.log(emission)
         targets = torch.tensor(targets, dtype=targets_dtype, device=self.device)
-        input_lengths = torch.tensor((log_probs.shape[0]))
-        target_lengths = torch.tensor((targets.shape[0]))
+        input_lengths = torch.tensor([log_probs.shape[1]], device=self.device)
+        target_lengths = torch.tensor([targets.shape[1]], device=self.device)
         hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
+        assert hyp_path.shape == ref_path.shape
+        assert hyp_scores.shape == ref_scores.shape
         self.assertEqual(hyp_path, ref_path)
         self.assertEqual(hyp_scores, ref_scores)
 
     @parameterized.expand([(torch.int32,), (torch.int64,)])
     def test_forced_align_fail(self, targets_dtype):
-        log_probs = torch.rand(5, 6, dtype=self.dtype, device=self.device)
-        targets = torch.tensor([0, 1, 2, 3, 4, 4], dtype=targets_dtype, device=self.device)
+        log_probs = torch.rand(1, 5, 6, dtype=self.dtype, device=self.device)
+        targets = torch.tensor([[0, 1, 2, 3, 4, 4]], dtype=targets_dtype, device=self.device)
         blank = 5
-        input_lengths = torch.tensor((log_probs.shape[0]), device=self.device)
-        target_lengths = torch.tensor((targets.shape[0]), device=self.device)
+        input_lengths = torch.tensor([log_probs.shape[1]], device=self.device)
+        target_lengths = torch.tensor([targets.shape[1]], device=self.device)
         with self.assertRaisesRegex(RuntimeError, r"targets length is too long for CTC"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        targets = torch.tensor([5, 3, 3], dtype=targets_dtype, device=self.device)
+        targets = torch.tensor([[5, 3, 3]], dtype=targets_dtype, device=self.device)
         with self.assertRaisesRegex(ValueError, r"targets Tensor shouldn't contain blank index"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
         log_probs = log_probs.int()
-        targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device)
+        targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device)
         with self.assertRaisesRegex(RuntimeError, r"log_probs must be float64, float32 or float16"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
@@ -1175,40 +1180,42 @@ def test_forced_align_fail(self, targets_dtype):
 
         log_probs = torch.rand(3, 4, 6, dtype=self.dtype, device=self.device)
         targets = targets.int()
-        with self.assertRaisesRegex(RuntimeError, r"3-D tensor is not yet supported for log_probs"):
+        with self.assertRaisesRegex(
+            RuntimeError, r"The batch dimension for log_probs must be 1 at the current version"
+        ):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
         targets = torch.randint(0, 4, (3, 4), device=self.device)
-        log_probs = torch.rand(3, 6, dtype=self.dtype, device=self.device)
-        with self.assertRaisesRegex(RuntimeError, r"2-D tensor is not yet supported for targets"):
+        log_probs = torch.rand(1, 3, 6, dtype=self.dtype, device=self.device)
+        with self.assertRaisesRegex(RuntimeError, r"The batch dimension for targets must be 1 at the current version"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device)
-        input_lengths = torch.randint(1, 5, (3,), device=self.device)
-        with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 0-D"):
+        targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device)
+        input_lengths = torch.randint(1, 5, (3, 5), device=self.device)
+        with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 1-D"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        input_lengths = torch.tensor((log_probs.shape[0]), device=self.device)
-        target_lengths = torch.randint(1, 5, (3,), device=self.device)
-        with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 0-D"):
+        input_lengths = torch.tensor([log_probs.shape[1]], device=self.device)
+        target_lengths = torch.randint(1, 5, (3, 5), device=self.device)
+        with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 1-D"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        input_lengths = torch.tensor((10000), device=self.device)
-        target_lengths = torch.tensor((targets.shape[0]), device=self.device)
+        input_lengths = torch.tensor([10000], device=self.device)
+        target_lengths = torch.tensor([targets.shape[1]], device=self.device)
         with self.assertRaisesRegex(RuntimeError, r"input length mismatch"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        input_lengths = torch.tensor((log_probs.shape[0]))
-        target_lengths = torch.tensor((10000))
+        input_lengths = torch.tensor([log_probs.shape[1]], device=self.device)
+        target_lengths = torch.tensor([10000], device=self.device)
         with self.assertRaisesRegex(RuntimeError, r"target length mismatch"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        targets = torch.tensor([7, 8, 9, 10], dtype=targets_dtype, device=self.device)
-        log_probs = torch.rand(10, 5, dtype=self.dtype, device=self.device)
+        targets = torch.tensor([[7, 8, 9, 10]], dtype=targets_dtype, device=self.device)
+        log_probs = torch.rand(1, 10, 5, dtype=self.dtype, device=self.device)
         with self.assertRaisesRegex(ValueError, r"targets values must be less than the CTC dimension"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
 
-        targets = torch.tensor([1, 3, 3], dtype=targets_dtype, device=self.device)
+        targets = torch.tensor([[1, 3, 3]], dtype=targets_dtype, device=self.device)
         blank = 10000
         with self.assertRaisesRegex(RuntimeError, r"blank must be within \[0, num classes\)"):
             hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank)
@@ -1238,14 +1245,14 @@ class FunctionalCUDAOnly(TestBaseMixin):
     @nested_params(
         [torch.half, torch.float, torch.double],
         [torch.int32, torch.int64],
-        [(50, 100), (100, 100)],
-        [(10,), (40,), (45,)],
+        [(1, 50, 100), (1, 100, 100)],
+        [(1, 10), (1, 40), (1, 45)],
     )
     def test_forced_align_same_result(self, log_probs_dtype, targets_dtype, log_probs_shape, targets_shape):
         log_probs = torch.rand(log_probs_shape, dtype=log_probs_dtype, device=self.device)
         targets = torch.randint(1, 100, targets_shape, dtype=targets_dtype, device=self.device)
-        input_lengths = torch.tensor((log_probs.shape[0]), device=self.device)
-        target_lengths = torch.tensor((targets.shape[0]), device=self.device)
+        input_lengths = torch.tensor([log_probs.shape[1]], device=self.device)
+        target_lengths = torch.tensor([targets.shape[1]], device=self.device)
         log_probs_cuda = log_probs.cuda()
         targets_cuda = targets.cuda()
         input_lengths_cuda = input_lengths.cuda()
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
deleted file mode 100644
index 07cd3c9d4b..0000000000
--- a/third_party/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-set(CMAKE_CXX_VISIBILITY_PRESET hidden)
-
-file(MAKE_DIRECTORY install/include)
-file(MAKE_DIRECTORY install/lib)
-
-################################################################################
-# sox
-################################################################################
-if (BUILD_SOX)
-  add_subdirectory(sox)
-endif()
diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py
index 95b59c207c..2e5610bc79 100644
--- a/tools/setup_helpers/extension.py
+++ b/tools/setup_helpers/extension.py
@@ -37,7 +37,6 @@ def _get_build(var, default=False):
 _BUILD_RIR = _get_build("BUILD_RIR", True)
 _BUILD_RNNT = _get_build("BUILD_RNNT", True)
 _USE_FFMPEG = _get_build("USE_FFMPEG", False)
-_DLOPEN_FFMPEG = _get_build("DLOPEN_FFMPEG", False)
 _USE_ROCM = _get_build("USE_ROCM", torch.backends.cuda.is_built() and torch.version.hip is not None)
 _USE_CUDA = _get_build("USE_CUDA", torch.backends.cuda.is_built() and torch.version.hip is None)
 _BUILD_ALIGN = _get_build("BUILD_ALIGN", True)
@@ -125,7 +124,6 @@ def build_extension(self, ext):
             f"-DUSE_CUDA:BOOL={'ON' if _USE_CUDA else 'OFF'}",
             f"-DUSE_OPENMP:BOOL={'ON' if _USE_OPENMP else 'OFF'}",
             f"-DUSE_FFMPEG:BOOL={'ON' if _USE_FFMPEG else 'OFF'}",
-            f"-DDLOPEN_FFMPEG:BOOL={'ON' if _DLOPEN_FFMPEG else 'OFF'}",
         ]
         build_args = ["--target", "install"]
         # Pass CUDA architecture to cmake
diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt
index 849d83d62f..e3445265b5 100644
--- a/torchaudio/csrc/ffmpeg/CMakeLists.txt
+++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt
@@ -2,13 +2,11 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}")
 find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil)
 add_library(ffmpeg INTERFACE)
 target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}")
-if (NOT DLOPEN_FFMPEG)
 target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}")
-endif()
+
 
 set(
   sources
-  stub.cpp
   ffmpeg.cpp
   filter_graph.cpp
   hw_context.cpp
@@ -33,24 +31,24 @@ if (USE_CUDA)
     cuda_deps)
 endif()
 
-if (DLOPEN_FFMPEG)
-  set(compile_definitions DLOPEN_FFMPEG)
-endif()
-
 torchaudio_library(
   libtorchaudio_ffmpeg
   "${sources}"
   ""
   "torch;ffmpeg;${additional_lib}"
-  "${compile_definitions}"
+  ""
   )
 
 if (BUILD_TORCHAUDIO_PYTHON_EXTENSION)
+  set(
+    ext_sources
+    pybind/pybind.cpp
+    )
   torchaudio_extension(
     _torchaudio_ffmpeg
-    pybind/pybind.cpp
+    "${ext_sources}"
     ""
     "libtorchaudio_ffmpeg"
-    "${compile_definitions}"
+    ""
     )
 endif ()
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
index 55e6c142b9..7822b30392 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -1,6 +1,5 @@
 #include <c10/util/Exception.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -8,12 +7,6 @@
 
 namespace torchaudio::io {
 
-std::string av_err2string(int errnum) {
-  char str[AV_ERROR_MAX_STRING_SIZE];
-  FFMPEG av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE);
-  return str;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 // AVDictionary
 ////////////////////////////////////////////////////////////////////////////////
@@ -21,7 +14,7 @@ AVDictionary* get_option_dict(const c10::optional<OptionDict>& option) {
   AVDictionary* opt = nullptr;
   if (option) {
     for (auto const& [key, value] : option.value()) {
-      FFMPEG av_dict_set(&opt, key.c_str(), value.c_str(), 0);
+      av_dict_set(&opt, key.c_str(), value.c_str(), 0);
     }
   }
   return opt;
@@ -32,10 +25,10 @@ void clean_up_dict(AVDictionary* p) {
     std::vector<std::string> unused_keys;
     // Check and copy unused keys, clean up the original dictionary
     AVDictionaryEntry* t = nullptr;
-    while ((t = FFMPEG av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
+    while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) {
       unused_keys.emplace_back(t->key);
     }
-    FFMPEG av_dict_free(&p);
+    av_dict_free(&p);
     TORCH_CHECK(
         unused_keys.empty(),
         "Unexpected options: ",
@@ -47,14 +40,14 @@ void clean_up_dict(AVDictionary* p) {
 // AVFormatContext
 ////////////////////////////////////////////////////////////////////////////////
 void AVFormatInputContextDeleter::operator()(AVFormatContext* p) {
-  FFMPEG avformat_close_input(&p);
+  avformat_close_input(&p);
 };
 
 AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p)
     : Wrapper<AVFormatContext, AVFormatInputContextDeleter>(p) {}
 
 void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) {
-  FFMPEG avformat_free_context(p);
+  avformat_free_context(p);
 };
 
 AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
@@ -64,9 +57,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p)
 // AVIO
 ////////////////////////////////////////////////////////////////////////////////
 void AVIOContextDeleter::operator()(AVIOContext* p) {
-  FFMPEG avio_flush(p);
-  FFMPEG av_freep(&p->buffer);
-  FFMPEG av_freep(&p);
+  avio_flush(p);
+  av_freep(&p->buffer);
+  av_freep(&p);
 };
 
 AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
@@ -76,13 +69,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p)
 // AVPacket
 ////////////////////////////////////////////////////////////////////////////////
 void AVPacketDeleter::operator()(AVPacket* p) {
-  FFMPEG av_packet_free(&p);
+  av_packet_free(&p);
 };
 
 AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper<AVPacket, AVPacketDeleter>(p) {}
 
 AVPacketPtr alloc_avpacket() {
-  AVPacket* p = FFMPEG av_packet_alloc();
+  AVPacket* p = av_packet_alloc();
   TORCH_CHECK(p, "Failed to allocate AVPacket object.");
   return AVPacketPtr{p};
 }
@@ -92,7 +85,7 @@ AVPacketPtr alloc_avpacket() {
 ////////////////////////////////////////////////////////////////////////////////
 AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){};
 AutoPacketUnref::~AutoPacketUnref() {
-  FFMPEG av_packet_unref(p_);
+  av_packet_unref(p_);
 }
 AutoPacketUnref::operator AVPacket*() const {
   return p_;
@@ -102,13 +95,13 @@ AutoPacketUnref::operator AVPacket*() const {
 // AVFrame
 ////////////////////////////////////////////////////////////////////////////////
 void AVFrameDeleter::operator()(AVFrame* p) {
-  FFMPEG av_frame_free(&p);
+  av_frame_free(&p);
 };
 
 AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper<AVFrame, AVFrameDeleter>(p) {}
 
 AVFramePtr alloc_avframe() {
-  AVFrame* p = FFMPEG av_frame_alloc();
+  AVFrame* p = av_frame_alloc();
   TORCH_CHECK(p, "Failed to allocate AVFrame object.");
   return AVFramePtr{p};
 };
@@ -117,7 +110,7 @@ AVFramePtr alloc_avframe() {
 // AVCodecContext
 ////////////////////////////////////////////////////////////////////////////////
 void AVCodecContextDeleter::operator()(AVCodecContext* p) {
-  FFMPEG avcodec_free_context(&p);
+  avcodec_free_context(&p);
 };
 
 AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
@@ -127,7 +120,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
 // AVBufferRefPtr
 ////////////////////////////////////////////////////////////////////////////////
 void AutoBufferUnref::operator()(AVBufferRef* p) {
-  FFMPEG av_buffer_unref(&p);
+  av_buffer_unref(&p);
 }
 
 AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
@@ -137,7 +130,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p)
 // AVFilterGraph
 ////////////////////////////////////////////////////////////////////////////////
 void AVFilterGraphDeleter::operator()(AVFilterGraph* p) {
-  FFMPEG avfilter_graph_free(&p);
+  avfilter_graph_free(&p);
 };
 
 AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
@@ -147,7 +140,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p)
 // AVCodecParameters
 ////////////////////////////////////////////////////////////////////////////////
 void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) {
-  FFMPEG avcodec_parameters_free(&codecpar);
+  avcodec_parameters_free(&codecpar);
 }
 
 AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p)
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
index 83d18464fa..0bae00c12d 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.h
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -41,7 +41,10 @@ using OptionDict = std::map<std::string, std::string>;
 // Replacement of av_err2str, which causes
 // `error: taking address of temporary array`
 // https://github.com/joncampbell123/composite-video-simulator/issues/5
-std::string av_err2string(int errnum);
+av_always_inline std::string av_err2string(int errnum) { // FFmpeg's message for error code `errnum`
+  char str[AV_ERROR_MAX_STRING_SIZE]; // scratch buffer the message is written into
+  return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); // copied into the returned std::string
+}
 
 // Base structure that handles memory management.
 // Resource is freed by the destructor of unique_ptr,
diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp
index faa3606e08..1a1e40b011 100644
--- a/torchaudio/csrc/ffmpeg/filter_graph.cpp
+++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp
@@ -1,12 +1,11 @@
 #include <torchaudio/csrc/ffmpeg/filter_graph.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 #include <stdexcept>
 
 namespace torchaudio::io {
 
 namespace {
 AVFilterGraph* get_filter_graph() {
-  AVFilterGraph* ptr = FFMPEG avfilter_graph_alloc();
+  AVFilterGraph* ptr = avfilter_graph_alloc();
   TORCH_CHECK(ptr, "Failed to allocate resouce.");
   ptr->nb_threads = 1;
   return ptr;
@@ -32,7 +31,7 @@ std::string get_audio_src_args(
       time_base.num,
       time_base.den,
       sample_rate,
-      FFMPEG av_get_sample_fmt_name(format),
+      av_get_sample_fmt_name(format),
       channel_layout);
   return std::string(args);
 }
@@ -51,7 +50,7 @@ std::string get_video_src_args(
       "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d",
       width,
       height,
-      FFMPEG av_get_pix_fmt_name(format),
+      av_get_pix_fmt_name(format),
       time_base.num,
       time_base.den,
       frame_rate.num,
@@ -69,7 +68,7 @@ void FilterGraph::add_audio_src(
     int sample_rate,
     uint64_t channel_layout) {
   add_src(
-      FFMPEG avfilter_get_by_name("abuffer"),
+      avfilter_get_by_name("abuffer"),
       get_audio_src_args(format, time_base, sample_rate, channel_layout));
 }
 
@@ -81,13 +80,13 @@ void FilterGraph::add_video_src(
     int height,
     AVRational sample_aspect_ratio) {
   add_src(
-      FFMPEG avfilter_get_by_name("buffer"),
+      avfilter_get_by_name("buffer"),
       get_video_src_args(
           format, time_base, frame_rate, width, height, sample_aspect_ratio));
 }
 
 void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
-  int ret = FFMPEG avfilter_graph_create_filter(
+  int ret = avfilter_graph_create_filter(
       &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph);
   TORCH_CHECK(
       ret >= 0,
@@ -96,11 +95,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) {
 }
 
 void FilterGraph::add_audio_sink() {
-  add_sink(FFMPEG avfilter_get_by_name("abuffersink"));
+  add_sink(avfilter_get_by_name("abuffersink"));
 }
 
 void FilterGraph::add_video_sink() {
-  add_sink(FFMPEG avfilter_get_by_name("buffersink"));
+  add_sink(avfilter_get_by_name("buffersink"));
 }
 
 void FilterGraph::add_sink(const AVFilter* buffersink) {
@@ -114,7 +113,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) {
   // According to the other example
   // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html
   // `abuffersink` should not take options, and this resolved issue.
-  int ret = FFMPEG avfilter_graph_create_filter(
+  int ret = avfilter_graph_create_filter(
       &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph);
   TORCH_CHECK(ret >= 0, "Failed to create output filter.");
 }
@@ -131,15 +130,15 @@ class InOuts {
 
  public:
   InOuts(const char* name, AVFilterContext* pCtx) {
-    p = FFMPEG avfilter_inout_alloc();
+    p = avfilter_inout_alloc();
     TORCH_CHECK(p, "Failed to allocate AVFilterInOut.");
-    p->name = FFMPEG av_strdup(name);
+    p->name = av_strdup(name);
     p->filter_ctx = pCtx;
     p->pad_idx = 0;
     p->next = nullptr;
   }
   ~InOuts() {
-    FFMPEG avfilter_inout_free(&p);
+    avfilter_inout_free(&p);
   }
   operator AVFilterInOut**() {
     return &p;
@@ -156,7 +155,7 @@ void FilterGraph::add_process(const std::string& filter_description) {
   // If you are debugging this part of the code, you might get confused.
   InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx};
 
-  int ret = FFMPEG avfilter_graph_parse_ptr(
+  int ret = avfilter_graph_parse_ptr(
       graph, filter_description.c_str(), out, in, nullptr);
 
   TORCH_CHECK(
@@ -167,11 +166,11 @@ void FilterGraph::add_process(const std::string& filter_description) {
 
 void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) {
   buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx;
-  int ret = FFMPEG avfilter_graph_config(graph, nullptr);
+  int ret = avfilter_graph_config(graph, nullptr);
   TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret));
-  // char* desc = FFMPEG avfilter_graph_dump(graph, NULL);
+  // char* desc = avfilter_graph_dump(graph, NULL);
   // std::cerr << "Filter created:\n" << desc << std::endl;
-  // FFMPEG av_free(static_cast<void*>(desc));
+  // av_free(static_cast<void*>(desc));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -191,8 +190,7 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
       ret.num_channels = l->ch_layout.nb_channels;
 #else
       // Before FFmpeg 5.1
-      ret.num_channels =
-          FFMPEG av_get_channel_layout_nb_channels(l->channel_layout);
+      ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout);
 #endif
       break;
     }
@@ -215,12 +213,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const {
 // Streaming process
 //////////////////////////////////////////////////////////////////////////////
 int FilterGraph::add_frame(AVFrame* pInputFrame) {
-  return FFMPEG av_buffersrc_add_frame_flags(
+  return av_buffersrc_add_frame_flags(
       buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF);
 }
 
 int FilterGraph::get_frame(AVFrame* pOutputFrame) {
-  return FFMPEG av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
+  return av_buffersink_get_frame(buffersink_ctx, pOutputFrame);
 }
 
 } // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/hw_context.cpp b/torchaudio/csrc/ffmpeg/hw_context.cpp
index 5c84f3dd09..a1d7f3c7a0 100644
--- a/torchaudio/csrc/ffmpeg/hw_context.cpp
+++ b/torchaudio/csrc/ffmpeg/hw_context.cpp
@@ -1,5 +1,4 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
 namespace {
@@ -16,7 +15,7 @@ AVBufferRef* get_cuda_context(int index) {
   }
   if (CUDA_CONTEXT_CACHE.count(index) == 0) {
     AVBufferRef* p = nullptr;
-    int ret = FFMPEG av_hwdevice_ctx_create(
+    int ret = av_hwdevice_ctx_create(
         &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0);
     TORCH_CHECK(
         ret >= 0,
diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
index 5fcb9f6df8..95db01fcec 100644
--- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
+++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -2,7 +2,6 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
 namespace {
@@ -10,15 +9,15 @@ namespace {
 std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> get_versions() {
   std::map<std::string, std::tuple<int64_t, int64_t, int64_t>> ret;
 
-#define add_version(NAME)              \
-  {                                    \
-    int ver = FFMPEG NAME##_version(); \
-    ret.emplace(                       \
-        "lib" #NAME,                   \
-        std::make_tuple<>(             \
-            AV_VERSION_MAJOR(ver),     \
-            AV_VERSION_MINOR(ver),     \
-            AV_VERSION_MICRO(ver)));   \
+#define add_version(NAME)            \
+  {                                  \
+    int ver = NAME##_version();      \
+    ret.emplace(                     \
+        "lib" #NAME,                 \
+        std::make_tuple<>(           \
+            AV_VERSION_MAJOR(ver),   \
+            AV_VERSION_MINOR(ver),   \
+            AV_VERSION_MICRO(ver))); \
   }
 
   add_version(avutil);
@@ -35,7 +34,7 @@ std::map<std::string, std::string> get_demuxers(bool req_device) {
   std::map<std::string, std::string> ret;
   const AVInputFormat* fmt = nullptr;
   void* i = nullptr;
-  while ((fmt = FFMPEG av_demuxer_iterate(&i))) {
+  while ((fmt = av_demuxer_iterate(&i))) {
     assert(fmt);
     bool is_device = [&]() {
       const AVClass* avclass = fmt->priv_class;
@@ -52,7 +51,7 @@ std::map<std::string, std::string> get_muxers(bool req_device) {
   std::map<std::string, std::string> ret;
   const AVOutputFormat* fmt = nullptr;
   void* i = nullptr;
-  while ((fmt = FFMPEG av_muxer_iterate(&i))) {
+  while ((fmt = av_muxer_iterate(&i))) {
     assert(fmt);
     bool is_device = [&]() {
       const AVClass* avclass = fmt->priv_class;
@@ -71,10 +70,10 @@ std::map<std::string, std::string> get_codecs(
   const AVCodec* c = nullptr;
   void* i = nullptr;
   std::map<std::string, std::string> ret;
-  while ((c = FFMPEG av_codec_iterate(&i))) {
+  while ((c = av_codec_iterate(&i))) {
     assert(c);
-    if ((req_encoder && FFMPEG av_codec_is_encoder(c)) ||
-        (!req_encoder && FFMPEG av_codec_is_decoder(c))) {
+    if ((req_encoder && av_codec_is_encoder(c)) ||
+        (!req_encoder && av_codec_is_decoder(c))) {
       if (c->type == type && c->name) {
         ret.emplace(c->name, c->long_name ? c->long_name : "");
       }
@@ -87,7 +86,7 @@ std::vector<std::string> get_protocols(bool output) {
   void* opaque = nullptr;
   const char* name = nullptr;
   std::vector<std::string> ret;
-  while ((name = FFMPEG avio_enum_protocols(&opaque, output))) {
+  while ((name = avio_enum_protocols(&opaque, output))) {
     assert(name);
     ret.emplace_back(name);
   }
@@ -95,7 +94,7 @@ std::vector<std::string> get_protocols(bool output) {
 }
 
 std::string get_build_config() {
-  return FFMPEG avcodec_configuration();
+  return avcodec_configuration();
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -188,9 +187,9 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO {
 };
 
 PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
-  m.def("init", []() { FFMPEG avdevice_register_all(); });
-  m.def("get_log_level", []() { return FFMPEG av_log_get_level(); });
-  m.def("set_log_level", [](int level) { FFMPEG av_log_set_level(level); });
+  m.def("init", []() { avdevice_register_all(); });
+  m.def("get_log_level", []() { return av_log_get_level(); });
+  m.def("set_log_level", [](int level) { av_log_set_level(level); });
   m.def("get_versions", &get_versions);
   m.def("get_muxers", []() { return get_muxers(false); });
   m.def("get_demuxers", []() { return get_demuxers(false); });
@@ -246,22 +245,21 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def_property_readonly(
           "media_type",
           [](const OutputStreamInfo& o) -> std::string {
-            return FFMPEG av_get_media_type_string(o.media_type);
+            return av_get_media_type_string(o.media_type);
           })
       .def_property_readonly(
           "format",
           [](const OutputStreamInfo& o) -> std::string {
             switch (o.media_type) {
               case AVMEDIA_TYPE_AUDIO:
-                return FFMPEG av_get_sample_fmt_name(
-                    (AVSampleFormat)(o.format));
+                return av_get_sample_fmt_name((AVSampleFormat)(o.format));
               case AVMEDIA_TYPE_VIDEO:
-                return FFMPEG av_get_pix_fmt_name((AVPixelFormat)(o.format));
+                return av_get_pix_fmt_name((AVPixelFormat)(o.format));
               default:
                 TORCH_INTERNAL_ASSERT(
                     false,
                     "FilterGraph is returning unexpected media type: ",
-                    FFMPEG av_get_media_type_string(o.media_type));
+                    av_get_media_type_string(o.media_type));
             }
           })
       .def_readonly("sample_rate", &OutputStreamInfo::sample_rate)
@@ -285,7 +283,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def_property_readonly(
           "media_type",
           [](const SrcStreamInfo& s) {
-            return FFMPEG av_get_media_type_string(s.media_type);
+            return av_get_media_type_string(s.media_type);
           })
       .def_readonly("codec_name", &SrcStreamInfo::codec_name)
       .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name)
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
index cf126d16a2..406f4e91bf 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp
@@ -1,6 +1,5 @@
 #include <torch/torch.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 #ifdef USE_CUDA
 #include <c10/cuda/CUDAStream.h>
@@ -429,11 +428,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      FFMPEG av_get_pix_fmt_name(fmt));
+      av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_NV12 == sw_fmt,
       "Expected NV12 format. Found: ",
-      FFMPEG av_get_pix_fmt_name(sw_fmt));
+      av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   auto status = cudaMemcpy2D(
@@ -510,11 +509,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      FFMPEG av_get_pix_fmt_name(fmt));
+      av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_P010 == sw_fmt,
       "Expected P010 format. Found: ",
-      FFMPEG av_get_pix_fmt_name(sw_fmt));
+      av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   auto status = cudaMemcpy2D(
@@ -591,11 +590,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) {
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_CUDA == fmt,
       "Expected CUDA frame. Found: ",
-      FFMPEG av_get_pix_fmt_name(fmt));
+      av_get_pix_fmt_name(fmt));
   TORCH_INTERNAL_ASSERT(
       AV_PIX_FMT_YUV444P == sw_fmt,
       "Expected YUV444P format. Found: ",
-      FFMPEG av_get_pix_fmt_name(sw_fmt));
+      av_get_pix_fmt_name(sw_fmt));
 
   // Write Y plane directly
   for (int i = 0; i < 3; ++i) {
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
index 883999fa41..bcff81dc3b 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp
@@ -1,11 +1,9 @@
 #include <torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
-
 void PacketBuffer::push_packet(AVPacket* packet) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null.");
-  AVPacket* p = FFMPEG av_packet_clone(packet);
+  AVPacket* p = av_packet_clone(packet);
   TORCH_INTERNAL_ASSERT(p, "Failed to clone packet.");
   packets.emplace_back(p);
 }
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
index 4f397d8b49..38440e3e33 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp
@@ -2,7 +2,6 @@
 #include <torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/conversion.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/post_process.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
 namespace detail {
@@ -49,7 +48,7 @@ FilterGraphFactory get_video_factory(
     f.add_video_sink();
     f.add_process(filter_desc);
     if (hw_frames_ctx) {
-      f.create_filter(FFMPEG av_buffer_ref(hw_frames_ctx));
+      f.create_filter(av_buffer_ref(hw_frames_ctx)); // hand the graph a new reference to the HW frames context
     } else {
       f.create_filter();
     }
@@ -140,7 +139,7 @@ struct ProcessImpl : public IPostDecodeProcess {
       if (ret >= 0) {
         buffer.push_frame(converter.convert(frame), frame->pts);
       }
-      FFMPEG av_frame_unref(frame);
+      av_frame_unref(frame);
     }
     return ret;
   }
@@ -160,7 +159,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
   TORCH_INTERNAL_ASSERT(
       i.type == AVMEDIA_TYPE_AUDIO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   using B = UnchunkedBuffer;
 
@@ -227,7 +226,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_audio_process(
     }
     default:
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt));
+          false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
   }
 }
 
@@ -240,7 +239,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_AUDIO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   using B = ChunkedBuffer;
   B buffer{i.time_base, frames_per_chunk, num_chunks};
@@ -308,7 +307,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_audio_process(
     }
     default:
       TORCH_INTERNAL_ASSERT(
-          false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt));
+          false, "Unexpected audio type:", av_get_sample_fmt_name(fmt));
   }
 }
 
@@ -322,7 +321,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   auto h = i.height;
   auto w = i.width;
@@ -376,9 +375,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_video_process(
     }
     default: {
       TORCH_INTERNAL_ASSERT(
-          false,
-          "Unexpected video format found: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
     }
   }
 }
@@ -396,7 +393,7 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   using B = UnchunkedBuffer;
   switch (auto fmt = (AVPixelFormat)i.format; fmt) {
@@ -419,13 +416,13 @@ std::unique_ptr<IPostDecodeProcess> get_unchunked_cuda_video_process(
       TORCH_CHECK(
           false,
           "Unsupported video format found in CUDA HW: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          av_get_pix_fmt_name(fmt));
     }
     default: {
       TORCH_CHECK(
           false,
           "Unexpected video format found in CUDA HW: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          av_get_pix_fmt_name(fmt));
     }
   }
 #endif
@@ -440,7 +437,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   auto h = i.height;
   auto w = i.width;
@@ -494,9 +491,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_video_process(
     }
     default: {
       TORCH_INTERNAL_ASSERT(
-          false,
-          "Unexpected video format found: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt));
     }
   }
 }
@@ -516,7 +511,7 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       i.type == AVMEDIA_TYPE_VIDEO,
       "Unsupported media type found: ",
-      FFMPEG av_get_media_type_string(i.type));
+      av_get_media_type_string(i.type));
 
   using B = ChunkedBuffer;
   switch (auto fmt = (AVPixelFormat)i.format; fmt) {
@@ -545,13 +540,13 @@ std::unique_ptr<IPostDecodeProcess> get_chunked_cuda_video_process(
       TORCH_CHECK(
           false,
           "Unsupported video format found in CUDA HW: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          av_get_pix_fmt_name(fmt));
     }
     default: {
       TORCH_CHECK(
           false,
           "Unexpected video format found in CUDA HW: ",
-          FFMPEG av_get_pix_fmt_name(fmt));
+          av_get_pix_fmt_name(fmt));
     }
   }
 #endif
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
index ffd1ddea38..2213a4018a 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp
@@ -1,10 +1,10 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 #include <stdexcept>
 #include <string_view>
 
 namespace torchaudio::io {
+
 namespace {
 AVCodecContextPtr alloc_codec_context(
     enum AVCodecID codec_id,
@@ -12,24 +12,24 @@ AVCodecContextPtr alloc_codec_context(
   const AVCodec* codec = [&]() {
     if (decoder_name) {
       const AVCodec* c =
-          FFMPEG avcodec_find_decoder_by_name(decoder_name.value().c_str());
+          avcodec_find_decoder_by_name(decoder_name.value().c_str());
       TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value());
       return c;
     } else {
-      const AVCodec* c = FFMPEG avcodec_find_decoder(codec_id);
-      TORCH_CHECK(c, "Unsupported codec: ", FFMPEG avcodec_get_name(codec_id));
+      const AVCodec* c = avcodec_find_decoder(codec_id);
+      TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id));
       return c;
     }
   }();
 
-  AVCodecContext* codec_ctx = FFMPEG avcodec_alloc_context3(codec);
+  AVCodecContext* codec_ctx = avcodec_alloc_context3(codec);
   TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext.");
   return AVCodecContextPtr(codec_ctx);
 }
 
 const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) {
   for (int i = 0;; ++i) {
-    const AVCodecHWConfig* config = FFMPEG avcodec_get_hw_config(codec, i);
+    const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i);
     if (!config) {
       break;
     }
@@ -82,7 +82,7 @@ enum AVPixelFormat get_hw_format(
 }
 
 AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
-  AVBufferRef* p = FFMPEG av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
+  AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx);
   TORCH_CHECK(
       p,
       "Failed to allocate CUDA frame context from device context at ",
@@ -93,11 +93,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) {
   frames_ctx->width = codec_ctx->width;
   frames_ctx->height = codec_ctx->height;
   frames_ctx->initial_pool_size = 5;
-  int ret = FFMPEG av_hwframe_ctx_init(p);
+  int ret = av_hwframe_ctx_init(p);
   if (ret >= 0) {
     return p;
   }
-  FFMPEG av_buffer_unref(&p);
+  av_buffer_unref(&p); // init failed: drop our reference before reporting the error
   TORCH_CHECK(
       false, "Failed to initialize CUDA frame context: ", av_err2string(ret));
 }
@@ -106,7 +106,7 @@ void configure_codec_context(
     AVCodecContext* codec_ctx,
     const AVCodecParameters* params,
     const torch::Device& device) {
-  int ret = FFMPEG avcodec_parameters_to_context(codec_ctx, params);
+  int ret = avcodec_parameters_to_context(codec_ctx, params);
   TORCH_CHECK(
       ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret));
 
@@ -121,8 +121,7 @@ void configure_codec_context(
     // 2. Set pCodecContext->get_format call back function which
     // will retrieve the HW pixel format from opaque pointer.
     codec_ctx->get_format = get_hw_format;
-    codec_ctx->hw_device_ctx =
-        FFMPEG av_buffer_ref(get_cuda_context(device.index()));
+    codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); // new reference to the CUDA device context for this device index
     TORCH_INTERNAL_ASSERT(
         codec_ctx->hw_device_ctx, "Failed to reference HW device context.");
 #endif
@@ -135,16 +134,16 @@ void open_codec(
   AVDictionary* opts = get_option_dict(decoder_option);
 
   // Default to single thread execution.
-  if (!FFMPEG av_dict_get(opts, "threads", nullptr, 0)) {
-    FFMPEG av_dict_set(&opts, "threads", "1", 0);
+  if (!av_dict_get(opts, "threads", nullptr, 0)) {
+    av_dict_set(&opts, "threads", "1", 0);
   }
 
   if (!codec_ctx->channel_layout) {
     codec_ctx->channel_layout =
-        FFMPEG av_get_default_channel_layout(codec_ctx->channels);
+        av_get_default_channel_layout(codec_ctx->channels);
   }
 
-  int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
+  int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts);
   clean_up_dict(opts);
   TORCH_CHECK(
       ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret));
@@ -259,8 +258,8 @@ void StreamProcessor::remove_stream(KeyType key) {
 
 void StreamProcessor::set_discard_timestamp(int64_t timestamp) {
   TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative.");
-  discard_before_pts = FFMPEG av_rescale_q(
-      timestamp, FFMPEG av_get_time_base_q(), stream_time_base);
+  discard_before_pts =
+      av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base); // rescale from FFmpeg's internal time base to this stream's time base
 }
 
 void StreamProcessor::set_decoder(
@@ -306,9 +305,9 @@ int StreamProcessor::process_packet(AVPacket* packet) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       is_decoder_set(),
       "Decoder must have been set prior to calling this function.");
-  int ret = FFMPEG avcodec_send_packet(codec_ctx, packet);
+  int ret = avcodec_send_packet(codec_ctx, packet);
   while (ret >= 0) {
-    ret = FFMPEG avcodec_receive_frame(codec_ctx, frame);
+    ret = avcodec_receive_frame(codec_ctx, frame);
     //  AVERROR(EAGAIN) means that new input data is required to return new
     //  output.
     if (ret == AVERROR(EAGAIN))
@@ -355,7 +354,7 @@ int StreamProcessor::process_packet(AVPacket* packet) {
     }
 
     // else we can just unref the frame and continue
-    FFMPEG av_frame_unref(frame);
+    av_frame_unref(frame);
   }
   return ret;
 }
@@ -364,7 +363,7 @@ void StreamProcessor::flush() {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
       is_decoder_set(),
       "Decoder must have been set prior to calling this function.");
-  FFMPEG avcodec_flush_buffers(codec_ctx);
+  avcodec_flush_buffers(codec_ctx);
   for (auto& ite : post_processes) {
     ite.second->flush();
   }
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
index 518bc02131..b8e9d7a9bf 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp
@@ -1,15 +1,10 @@
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 #include <chrono>
 #include <sstream>
 #include <stdexcept>
 #include <thread>
 
-extern "C" {
-#include <libavutil/rational.h>
-}
-
 namespace torchaudio::io {
 
 using KeyType = StreamProcessor::KeyType;
@@ -23,7 +18,7 @@ AVFormatContext* get_input_format_context(
     const c10::optional<std::string>& format,
     const c10::optional<OptionDict>& option,
     AVIOContext* io_ctx) {
-  AVFormatContext* p = FFMPEG avformat_alloc_context();
+  AVFormatContext* p = avformat_alloc_context();
   TORCH_CHECK(p, "Failed to allocate AVFormatContext.");
   if (io_ctx) {
     p->pb = io_ctx;
@@ -33,7 +28,7 @@ AVFormatContext* get_input_format_context(
     if (format.has_value()) {
       std::string format_str = format.value();
       AVFORMAT_CONST AVInputFormat* pInput =
-          FFMPEG av_find_input_format(format_str.c_str());
+          av_find_input_format(format_str.c_str());
       TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\"");
       return pInput;
     }
@@ -41,7 +36,7 @@ AVFormatContext* get_input_format_context(
   }();
 
   AVDictionary* opt = get_option_dict(option);
-  int ret = FFMPEG avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
+  int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt);
   clean_up_dict(opt);
 
   TORCH_CHECK(
@@ -57,7 +52,7 @@ AVFormatContext* get_input_format_context(
 
 StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) {
   C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader");
-  int ret = FFMPEG avformat_find_stream_info(format_ctx, nullptr);
+  int ret = avformat_find_stream_info(format_ctx, nullptr);
   TORCH_CHECK(
       ret >= 0, "Failed to find stream information: ", av_err2string(ret));
 
@@ -114,7 +109,7 @@ void validate_src_stream_type(
       "Stream ",
       i,
       " is not ",
-      FFMPEG av_get_media_type_string(type),
+      av_get_media_type_string(type),
       " stream.");
 }
 
@@ -129,7 +124,7 @@ namespace {
 OptionDict parse_metadata(const AVDictionary* metadata) {
   AVDictionaryEntry* tag = nullptr;
   OptionDict ret;
-  while ((tag = FFMPEG av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
+  while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
     ret.emplace(std::string(tag->key), std::string(tag->value));
   }
   return ret;
@@ -152,8 +147,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
   ret.num_frames = stream->nb_frames;
   ret.bits_per_sample = codecpar->bits_per_raw_sample;
   ret.metadata = parse_metadata(stream->metadata);
-  const AVCodecDescriptor* desc =
-      FFMPEG avcodec_descriptor_get(codecpar->codec_id);
+  const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id);
   if (desc) {
     ret.codec_name = desc->name;
     ret.codec_long_name = desc->long_name;
@@ -163,7 +157,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
     case AVMEDIA_TYPE_AUDIO: {
       AVSampleFormat smp_fmt = static_cast<AVSampleFormat>(codecpar->format);
       if (smp_fmt != AV_SAMPLE_FMT_NONE) {
-        ret.fmt_name = FFMPEG av_get_sample_fmt_name(smp_fmt);
+        ret.fmt_name = av_get_sample_fmt_name(smp_fmt);
       }
       ret.sample_rate = static_cast<double>(codecpar->sample_rate);
       ret.num_channels = codecpar->channels;
@@ -172,7 +166,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
     case AVMEDIA_TYPE_VIDEO: {
       AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(codecpar->format);
       if (pix_fmt != AV_PIX_FMT_NONE) {
-        ret.fmt_name = FFMPEG av_get_pix_fmt_name(pix_fmt);
+        ret.fmt_name = av_get_pix_fmt_name(pix_fmt);
       }
       ret.width = codecpar->width;
       ret.height = codecpar->height;
@@ -186,7 +180,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const {
 
 namespace {
 AVCodecParameters* get_codecpar() {
-  AVCodecParameters* ptr = FFMPEG avcodec_parameters_alloc();
+  AVCodecParameters* ptr = avcodec_parameters_alloc();
   TORCH_CHECK(ptr, "Failed to allocate resource.");
   return ptr;
 }
@@ -197,7 +191,7 @@ StreamParams StreamReader::get_src_stream_params(int i) {
   AVStream* stream = format_ctx->streams[i];
 
   AVCodecParametersPtr codec_params(get_codecpar());
-  int ret = FFMPEG avcodec_parameters_copy(codec_params, stream->codecpar);
+  int ret = avcodec_parameters_copy(codec_params, stream->codecpar);
   TORCH_CHECK(
       ret >= 0,
       "Failed to copy the stream's codec parameters. (",
@@ -239,12 +233,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const {
 }
 
 int64_t StreamReader::find_best_audio_stream() const {
-  return FFMPEG av_find_best_stream(
+  return av_find_best_stream(
       format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0);
 }
 
 int64_t StreamReader::find_best_video_stream() const {
-  return FFMPEG av_find_best_stream(
+  return av_find_best_stream(
       format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0);
 }
 
@@ -294,7 +288,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) {
       TORCH_CHECK(false, "Invalid mode value: ", mode);
   }
 
-  int ret = FFMPEG av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
+  int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag);
 
   if (ret < 0) {
     seek_timestamp = 0;
@@ -407,12 +401,12 @@ void StreamReader::add_stream(
       case AVMEDIA_TYPE_AUDIO:
         return AVRational{0, 1};
       case AVMEDIA_TYPE_VIDEO:
-        return FFMPEG av_guess_frame_rate(format_ctx, stream, nullptr);
+        return av_guess_frame_rate(format_ctx, stream, nullptr);
       default:
         TORCH_INTERNAL_ASSERT(
             false,
             "Unexpected media type is given: ",
-            FFMPEG av_get_media_type_string(media_type));
+            av_get_media_type_string(media_type));
     }
   }();
   int key = processors[i]->add_stream(
@@ -451,7 +445,7 @@ void StreamReader::remove_stream(int64_t i) {
 // 1: It's done, caller should stop calling
 // <0: Some error happened
 int StreamReader::process_packet() {
-  int ret = FFMPEG av_read_frame(format_ctx, packet);
+  int ret = av_read_frame(format_ctx, packet);
   if (ret == AVERROR_EOF) {
     ret = drain();
     return (ret < 0) ? ret : 1;
@@ -582,13 +576,12 @@ AVIOContext* get_io_context(
     int buffer_size,
     int (*read_packet)(void* opaque, uint8_t* buf, int buf_size),
     int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
-  unsigned char* buffer =
-      static_cast<unsigned char*>(FFMPEG av_malloc(buffer_size));
+  unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
   TORCH_CHECK(buffer, "Failed to allocate buffer.");
-  AVIOContext* io_ctx = FFMPEG avio_alloc_context(
+  AVIOContext* io_ctx = avio_alloc_context(
       buffer, buffer_size, 0, opaque, read_packet, nullptr, seek);
   if (!io_ctx) {
-    FFMPEG av_freep(&buffer);
+    av_freep(&buffer);
     TORCH_CHECK(false, "Failed to allocate AVIOContext.");
   }
   return io_ctx;
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
index 3f9a153004..c13c3cfcb9 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp
@@ -1,12 +1,7 @@
 #include <torchaudio/csrc/ffmpeg/hw_context.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/encode_process.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 #include <cmath>
 
-extern "C" {
-#include <libavutil/rational.h>
-}
-
 namespace torchaudio::io {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -61,7 +56,7 @@ void EncodeProcess::process_frame(AVFrame* src) {
     if (ret >= 0) {
       encoder.encode(dst_frame);
     }
-    FFMPEG av_frame_unref(dst_frame);
+    av_frame_unref(dst_frame);
   }
 }
 
@@ -76,8 +71,8 @@ void EncodeProcess::flush() {
 namespace {
 
 enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
-  auto fmt = FFMPEG av_get_sample_fmt(src.c_str());
-  if (fmt != AV_SAMPLE_FMT_NONE && !FFMPEG av_sample_fmt_is_planar(fmt)) {
+  auto fmt = av_get_sample_fmt(src.c_str());
+  if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) {
     return fmt;
   }
   TORCH_CHECK(
@@ -94,7 +89,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
               AV_SAMPLE_FMT_S64,
               AV_SAMPLE_FMT_FLT,
               AV_SAMPLE_FMT_DBL}) {
-          ret.emplace_back(FFMPEG av_get_sample_fmt_name(fmt));
+          ret.emplace_back(av_get_sample_fmt_name(fmt));
         }
         return c10::Join(", ", ret);
       }(),
@@ -102,7 +97,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) {
 }
 
 enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
-  AVPixelFormat fmt = FFMPEG av_get_pix_fmt(src.c_str());
+  AVPixelFormat fmt = av_get_pix_fmt(src.c_str());
   switch (fmt) {
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_RGB24:
@@ -123,7 +118,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) {
               AV_PIX_FMT_RGB24,
               AV_PIX_FMT_BGR24,
               AV_PIX_FMT_YUV444P}) {
-          ret.emplace_back(FFMPEG av_get_pix_fmt_name(fmt));
+          ret.emplace_back(av_get_pix_fmt_name(fmt));
         }
         return c10::Join(", ", ret);
       }(),
@@ -137,21 +132,18 @@ const AVCodec* get_codec(
     AVCodecID default_codec,
     const c10::optional<std::string>& encoder) {
   if (encoder) {
-    const AVCodec* c =
-        FFMPEG avcodec_find_encoder_by_name(encoder.value().c_str());
+    const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str());
     TORCH_CHECK(c, "Unexpected codec: ", encoder.value());
     return c;
   }
-  const AVCodec* c = FFMPEG avcodec_find_encoder(default_codec);
+  const AVCodec* c = avcodec_find_encoder(default_codec);
   TORCH_CHECK(
-      c,
-      "Encoder not found for codec: ",
-      FFMPEG avcodec_get_name(default_codec));
+      c, "Encoder not found for codec: ", avcodec_get_name(default_codec));
   return c;
 }
 
 AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) {
-  AVCodecContext* ctx = FFMPEG avcodec_alloc_context3(codec);
+  AVCodecContext* ctx = avcodec_alloc_context3(codec);
   TORCH_CHECK(ctx, "Failed to allocate CodecContext.");
 
   if (flags & AVFMT_GLOBALHEADER) {
@@ -177,25 +169,25 @@ void open_codec(
   // while "libopus" refers to the one depends on libopusenc
   // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251
   if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) {
-    if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) {
+    if (!av_dict_get(opt, "strict", nullptr, 0)) {
       TORCH_WARN_ONCE(
           "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ",
           "If this is not desired, please provide \"strict\" encoder option ",
           "with desired value.");
-      FFMPEG av_dict_set(&opt, "strict", "experimental", 0);
+      av_dict_set(&opt, "strict", "experimental", 0);
     }
   }
   if (std::strcmp(codec_ctx->codec->name, "opus") == 0) {
-    if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) {
+    if (!av_dict_get(opt, "strict", nullptr, 0)) {
       TORCH_WARN_ONCE(
           "\"opus\" encoder is selected. Enabling '-strict experimental'. ",
           "If this is not desired, please provide \"strict\" encoder option ",
           "with desired value.");
-      FFMPEG av_dict_set(&opt, "strict", "experimental", 0);
+      av_dict_set(&opt, "strict", "experimental", 0);
     }
   }
 
-  int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
+  int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt);
   clean_up_dict(opt);
   TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")");
 }
@@ -222,7 +214,7 @@ bool supported_sample_fmt(
 std::string get_supported_formats(const AVSampleFormat* sample_fmts) {
   std::vector<std::string> ret;
   while (*sample_fmts != AV_SAMPLE_FMT_NONE) {
-    ret.emplace_back(FFMPEG av_get_sample_fmt_name(*sample_fmts));
+    ret.emplace_back(av_get_sample_fmt_name(*sample_fmts));
     ++sample_fmts;
   }
   return c10::Join(", ", ret);
@@ -234,7 +226,7 @@ AVSampleFormat get_enc_fmt(
     const AVCodec* codec) {
   if (encoder_format) {
     auto& enc_fmt_val = encoder_format.value();
-    auto fmt = FFMPEG av_get_sample_fmt(enc_fmt_val.c_str());
+    auto fmt = av_get_sample_fmt(enc_fmt_val.c_str());
     TORCH_CHECK(
         fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val);
     TORCH_CHECK(
@@ -321,8 +313,8 @@ std::string get_supported_channels(const uint64_t* channel_layouts) {
   std::vector<std::string> names;
   while (*channel_layouts) {
     std::stringstream ss;
-    ss << FFMPEG av_get_channel_layout_nb_channels(*channel_layouts);
-    ss << " (" << FFMPEG av_get_channel_name(*channel_layouts) << ")";
+    ss << av_get_channel_layout_nb_channels(*channel_layouts);
+    ss << " (" << av_get_channel_name(*channel_layouts) << ")";
     names.emplace_back(ss.str());
     ++channel_layouts;
   }
@@ -339,10 +331,10 @@ uint64_t get_channel_layout(
     TORCH_CHECK(
         val > 0, "The number of channels must be greater than 0. Found: ", val);
     if (!codec->channel_layouts) {
-      return static_cast<uint64_t>(FFMPEG av_get_default_channel_layout(val));
+      return static_cast<uint64_t>(av_get_default_channel_layout(val));
     }
     for (const uint64_t* it = codec->channel_layouts; *it; ++it) {
-      if (FFMPEG av_get_channel_layout_nb_channels(*it) == val) {
+      if (av_get_channel_layout_nb_channels(*it) == val) {
         return *it;
       }
     }
@@ -379,9 +371,8 @@ void configure_audio_codec_ctx(
     const c10::optional<CodecConfig>& codec_config) {
   codec_ctx->sample_fmt = format;
   codec_ctx->sample_rate = sample_rate;
-  codec_ctx->time_base = av_inv_q(FFMPEG av_d2q(sample_rate, 1 << 24));
-  codec_ctx->channels =
-      FFMPEG av_get_channel_layout_nb_channels(channel_layout);
+  codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24));
+  codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout);
   codec_ctx->channel_layout = channel_layout;
 
   // Set optional stuff
@@ -420,7 +411,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) {
 std::string get_supported_formats(const AVPixelFormat* pix_fmts) {
   std::vector<std::string> ret;
   while (*pix_fmts != AV_PIX_FMT_NONE) {
-    ret.emplace_back(FFMPEG av_get_pix_fmt_name(*pix_fmts));
+    ret.emplace_back(av_get_pix_fmt_name(*pix_fmts));
     ++pix_fmts;
   }
   return c10::Join(", ", ret);
@@ -432,7 +423,7 @@ AVPixelFormat get_enc_fmt(
     const AVCodec* codec) {
   if (encoder_format) {
     const auto& val = encoder_format.value();
-    auto fmt = FFMPEG av_get_pix_fmt(val.c_str());
+    auto fmt = av_get_pix_fmt(val.c_str());
     TORCH_CHECK(
         supported_pix_fmt(fmt, codec->pix_fmts),
         codec->name,
@@ -470,7 +461,7 @@ AVRational get_enc_rate(
         std::isfinite(enc_rate) && enc_rate > 0,
         "Encoder sample rate must be positive and fininte. Found: ",
         enc_rate);
-    AVRational rate = FFMPEG av_d2q(enc_rate, 1 << 24);
+    AVRational rate = av_d2q(enc_rate, 1 << 24);
     TORCH_CHECK(
         supported_frame_rate(rate, codec->supported_framerates),
         codec->name,
@@ -554,14 +545,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
   // context to AVCodecContext. But this way, it will be deallocated
   // automatically at the time AVCodecContext is freed, so we do that.
 
-  ctx->hw_device_ctx = FFMPEG av_buffer_ref(get_cuda_context(device.index()));
+  ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index()));
   TORCH_INTERNAL_ASSERT(
       ctx->hw_device_ctx, "Failed to reference HW device context.");
 
   ctx->sw_pix_fmt = ctx->pix_fmt;
   ctx->pix_fmt = AV_PIX_FMT_CUDA;
 
-  ctx->hw_frames_ctx = FFMPEG av_hwframe_ctx_alloc(ctx->hw_device_ctx);
+  ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx);
   TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context.");
 
   auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data);
@@ -571,7 +562,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
   frames_ctx->height = ctx->height;
   frames_ctx->initial_pool_size = 5;
 
-  int ret = FFMPEG av_hwframe_ctx_init(ctx->hw_frames_ctx);
+  int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx);
   TORCH_CHECK(
       ret >= 0,
       "Failed to initialize CUDA frame context: ",
@@ -583,11 +574,11 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) {
 ////////////////////////////////////////////////////////////////////////////////
 
 AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) {
-  AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr);
+  AVStream* stream = avformat_new_stream(format_ctx, nullptr);
   TORCH_CHECK(stream, "Failed to allocate stream.");
 
   stream->time_base = codec_ctx->time_base;
-  int ret = FFMPEG avcodec_parameters_from_context(stream->codecpar, codec_ctx);
+  int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx);
   TORCH_CHECK(
       ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret));
   return stream;
@@ -614,7 +605,7 @@ FilterGraph get_audio_filter_graph(
     if (filter_desc || src_fmt != enc_fmt ||
         src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) {
       std::stringstream ss;
-      ss << "aformat=sample_fmts=" << FFMPEG av_get_sample_fmt_name(enc_fmt)
+      ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt)
          << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x"
          << std::hex << enc_ch_layout;
       parts.push_back(ss.str());
@@ -665,7 +656,7 @@ FilterGraph get_video_filter_graph(
     }
     if (filter_desc || src_fmt != enc_fmt) {
       std::stringstream ss;
-      ss << "format=" << FFMPEG av_get_pix_fmt_name(enc_fmt);
+      ss << "format=" << av_get_pix_fmt_name(enc_fmt);
       parts.emplace_back(ss.str());
     }
     if (filter_desc ||
@@ -709,7 +700,7 @@ AVFramePtr get_audio_frame(
   frame->channel_layout = channel_layout;
   frame->sample_rate = sample_rate;
   frame->nb_samples = nb_samples;
-  int ret = FFMPEG av_frame_get_buffer(frame, 0);
+  int ret = av_frame_get_buffer(frame, 0);
   TORCH_CHECK(
       ret >= 0, "Error allocating the source audio frame:", av_err2string(ret));
 
@@ -725,7 +716,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) {
   frame->format = src_fmt;
   frame->width = width;
   frame->height = height;
-  int ret = FFMPEG av_frame_get_buffer(frame, 0);
+  int ret = av_frame_get_buffer(frame, 0);
   TORCH_CHECK(
       ret >= 0, "Error allocating a video buffer :", av_err2string(ret));
 
@@ -770,10 +761,10 @@ EncodeProcess get_audio_encode_process(
   // case, restrictions on the format to support tensor inputs do not apply, and
   // so we directly get the format via FFmpeg.
   const AVSampleFormat src_fmt = (disable_converter)
-      ? FFMPEG av_get_sample_fmt(format.c_str())
+      ? av_get_sample_fmt(format.c_str())
       : get_src_sample_fmt(format);
-  const auto src_ch_layout = static_cast<uint64_t>(
-      FFMPEG av_get_default_channel_layout(src_num_channels));
+  const auto src_ch_layout =
+      static_cast<uint64_t>(av_get_default_channel_layout(src_num_channels));
 
   // 2. Fetch codec from default or override
   TORCH_CHECK(
@@ -793,7 +784,7 @@ EncodeProcess get_audio_encode_process(
       // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277
       // This is the case for at least until FFmpeg 6.0, so it will be
       // like this for a while.
-      return static_cast<uint64_t>(FFMPEG av_get_default_channel_layout(2));
+      return static_cast<uint64_t>(av_get_default_channel_layout(2));
     }
     return get_channel_layout(src_ch_layout, encoder_num_channels, codec);
   }();
@@ -881,9 +872,9 @@ EncodeProcess get_video_encode_process(
   // case, restrictions on the format to support tensor inputs do not apply, and
   // so we directly get the format via FFmpeg.
   const AVPixelFormat src_fmt = (disable_converter)
-      ? FFMPEG av_get_pix_fmt(format.c_str())
+      ? av_get_pix_fmt(format.c_str())
       : get_src_pix_fmt(format);
-  const AVRational src_rate = FFMPEG av_d2q(frame_rate, 1 << 24);
+  const AVRational src_rate = av_d2q(frame_rate, 1 << 24);
 
   // 2. Fetch codec from default or override
   TORCH_CHECK(
@@ -950,8 +941,7 @@ EncodeProcess get_video_encode_process(
   AVFramePtr src_frame = [&]() {
     if (codec_ctx->hw_frames_ctx) {
       AVFramePtr frame{alloc_avframe()};
-      int ret =
-          FFMPEG av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
+      int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0);
       TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret));
       frame->nb_samples = 1;
       frame->pts = 0;
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
index 7552484f2a..3d2e501535 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp
@@ -1,5 +1,4 @@
 #include <torchaudio/csrc/ffmpeg/stream_writer/encoder.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
 
@@ -14,10 +13,10 @@ Encoder::Encoder(
 ///
 /// @param frame Frame data to encode
 void Encoder::encode(AVFrame* frame) {
-  int ret = FFMPEG avcodec_send_frame(codec_ctx, frame);
+  int ret = avcodec_send_frame(codec_ctx, frame);
   TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ").");
   while (ret >= 0) {
-    ret = FFMPEG avcodec_receive_packet(codec_ctx, packet);
+    ret = avcodec_receive_packet(codec_ctx, packet);
     if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
       if (ret == AVERROR_EOF) {
         // Note:
@@ -32,7 +31,7 @@ void Encoder::encode(AVFrame* frame) {
         // An alternative is to use `av_write_frame` functoin, but in that case
         // client code is responsible for ordering packets, which makes it
         // complicated to use StreamWriter
-        ret = FFMPEG av_interleaved_write_frame(format_ctx, nullptr);
+        ret = av_interleaved_write_frame(format_ctx, nullptr);
         TORCH_CHECK(
             ret >= 0, "Failed to flush packet (", av_err2string(ret), ").");
       }
@@ -52,11 +51,10 @@ void Encoder::encode(AVFrame* frame) {
       // This has to be set before av_packet_rescale_ts bellow.
       packet->duration = 1;
     }
-    FFMPEG av_packet_rescale_ts(
-        packet, codec_ctx->time_base, stream->time_base);
+    av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base);
     packet->stream_index = stream->index;
 
-    ret = FFMPEG av_interleaved_write_frame(format_ctx, packet);
+    ret = av_interleaved_write_frame(format_ctx, packet);
     TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ").");
   }
 }
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
index 45872a6af5..0701c5a596 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp
@@ -1,14 +1,13 @@
 #include <torchaudio/csrc/ffmpeg/stream_writer/packet_writer.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 namespace torchaudio::io {
 namespace {
 AVStream* add_stream(
     AVFormatContext* format_ctx,
     const StreamParams& stream_params) {
-  AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr);
-  int ret = FFMPEG avcodec_parameters_copy(
-      stream->codecpar, stream_params.codec_params);
+  AVStream* stream = avformat_new_stream(format_ctx, nullptr);
+  int ret =
+      avcodec_parameters_copy(stream->codecpar, stream_params.codec_params);
   TORCH_CHECK(
       ret >= 0,
       "Failed to copy the stream's codec parameters. (",
@@ -27,12 +26,11 @@ PacketWriter::PacketWriter(
 
 void PacketWriter::write_packet(const AVPacketPtr& packet) {
   AVPacket dst_packet;
-  int ret = FFMPEG av_packet_ref(&dst_packet, packet);
+  int ret = av_packet_ref(&dst_packet, packet);
   TORCH_CHECK(ret >= 0, "Failed to copy packet.");
-  FFMPEG av_packet_rescale_ts(
-      &dst_packet, original_time_base, stream->time_base);
+  av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base);
   dst_packet.stream_index = stream->index;
-  ret = FFMPEG av_interleaved_write_frame(format_ctx, &dst_packet);
+  ret = av_interleaved_write_frame(format_ctx, &dst_packet);
   TORCH_CHECK(ret >= 0, "Failed to write packet to destination.");
 }
 } // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
index 4252cd7072..df51d92355 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -1,11 +1,11 @@
 #include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 #ifdef USE_CUDA
 #include <c10/cuda/CUDAStream.h>
 #endif
 
-namespace torchaudio::io {
+namespace torchaudio {
+namespace io {
 namespace {
 
 AVFormatContext* get_output_format_context(
@@ -19,7 +19,7 @@ AVFormatContext* get_output_format_context(
   }
 
   AVFormatContext* p = nullptr;
-  int ret = FFMPEG avformat_alloc_output_context2(
+  int ret = avformat_alloc_output_context2(
       &p, nullptr, format ? format.value().c_str() : nullptr, dst.c_str());
   TORCH_CHECK(
       ret >= 0,
@@ -208,14 +208,14 @@ void StreamWriter::add_video_frame_stream(
 }
 
 void StreamWriter::set_metadata(const OptionDict& metadata) {
-  FFMPEG av_dict_free(&format_ctx->metadata);
+  av_dict_free(&format_ctx->metadata);
   for (auto const& [key, value] : metadata) {
-    FFMPEG av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0);
+    av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0);
   }
 }
 
 void StreamWriter::dump_format(int64_t i) {
-  FFMPEG av_dump_format(format_ctx, (int)i, format_ctx->url, 1);
+  av_dump_format(format_ctx, (int)i, format_ctx->url, 1);
 }
 
 void StreamWriter::open(const c10::optional<OptionDict>& option) {
@@ -231,10 +231,10 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
   AVDictionary* opt = get_option_dict(option);
   if (!(fmt->flags & AVFMT_NOFILE) &&
       !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
-    ret = FFMPEG avio_open2(
+    ret = avio_open2(
         &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt);
     if (ret < 0) {
-      FFMPEG av_dict_free(&opt);
+      av_dict_free(&opt);
       TORCH_CHECK(
           false,
           "Failed to open dst: ",
@@ -245,7 +245,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
     }
   }
 
-  ret = FFMPEG avformat_write_header(format_ctx, &opt);
+  ret = avformat_write_header(format_ctx, &opt);
   clean_up_dict(opt);
   TORCH_CHECK(
       ret >= 0,
@@ -258,7 +258,7 @@ void StreamWriter::open(const c10::optional<OptionDict>& option) {
 }
 
 void StreamWriter::close() {
-  int ret = FFMPEG av_write_trailer(format_ctx);
+  int ret = av_write_trailer(format_ctx);
   if (ret < 0) {
     LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ").";
   }
@@ -269,7 +269,7 @@ void StreamWriter::close() {
   if (!(fmt->flags & AVFMT_NOFILE) &&
       !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
     // avio_closep can be only applied to AVIOContext opened by avio_open
-    FFMPEG avio_closep(&(format_ctx->pb));
+    avio_closep(&(format_ctx->pb));
   }
   is_open = false;
 }
@@ -355,13 +355,12 @@ AVIOContext* get_io_context(
     int buffer_size,
     int (*write_packet)(void* opaque, uint8_t* buf, int buf_size),
     int64_t (*seek)(void* opaque, int64_t offset, int whence)) {
-  unsigned char* buffer =
-      static_cast<unsigned char*>(FFMPEG av_malloc(buffer_size));
+  unsigned char* buffer = static_cast<unsigned char*>(av_malloc(buffer_size));
   TORCH_CHECK(buffer, "Failed to allocate buffer.");
-  AVIOContext* io_ctx = FFMPEG avio_alloc_context(
+  AVIOContext* io_ctx = avio_alloc_context(
       buffer, buffer_size, 1, opaque, nullptr, write_packet, seek);
   if (!io_ctx) {
-    FFMPEG av_freep(&buffer);
+    av_freep(&buffer);
     TORCH_CHECK(false, "Failed to allocate AVIOContext.");
   }
   return io_ctx;
@@ -385,4 +384,5 @@ StreamWriterCustomIO::StreamWriterCustomIO(
     : CustomOutput(opaque, buffer_size, write_packet, seek),
       StreamWriter(io_ctx, format) {}
 
-} // namespace torchaudio::io
+} // namespace io
+} // namespace torchaudio
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
index 1478d38d5a..e9350f0479 100644
--- a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp
@@ -1,11 +1,11 @@
 #include <torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
 
 #ifdef USE_CUDA
 #include <c10/cuda/CUDAStream.h>
 #endif
 
 namespace torchaudio::io {
+
 namespace {
 
 using InitFunc = TensorConverter::InitFunc;
@@ -41,8 +41,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334
-  if (!FFMPEG av_frame_is_writable(buffer)) {
-    int ret = FFMPEG av_frame_make_writable(buffer);
+  if (!av_frame_is_writable(buffer)) {
+    int ret = av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -145,8 +145,8 @@ void write_interlaced_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  if (!FFMPEG av_frame_is_writable(buffer)) {
-    int ret = FFMPEG av_frame_make_writable(buffer);
+  if (!av_frame_is_writable(buffer)) {
+    int ret = av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -187,7 +187,7 @@ void write_planar_video(
     AVFrame* buffer,
     int num_planes) {
   const auto num_colors =
-      FFMPEG av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
+      av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components;
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors);
@@ -195,8 +195,8 @@ void write_planar_video(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width);
 
   // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472
-  if (!FFMPEG av_frame_is_writable(buffer)) {
-    int ret = FFMPEG av_frame_make_writable(buffer);
+  if (!av_frame_is_writable(buffer)) {
+    int ret = av_frame_make_writable(buffer);
     TORCH_INTERNAL_ASSERT(
         ret >= 0, "Failed to make frame writable: ", av_err2string(ret));
   }
@@ -308,7 +308,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
         TORCH_CHECK(
             false,
             "Unexpected pixel format for CUDA: ",
-            FFMPEG av_get_pix_fmt_name(sw_pix_fmt));
+            av_get_pix_fmt_name(sw_pix_fmt));
     }
   }
 
@@ -317,7 +317,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
     case AV_PIX_FMT_GRAY8:
     case AV_PIX_FMT_RGB24:
     case AV_PIX_FMT_BGR24: {
-      int channels = FFMPEG av_pix_fmt_desc_get(pix_fmt)->nb_components;
+      int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components;
       InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) {
         validate_video_input(t, f, channels);
         return init_interlaced(t);
@@ -339,9 +339,7 @@ std::pair<InitFunc, ConvertFunc> get_video_func(AVFrame* buffer) {
     }
     default:
       TORCH_CHECK(
-          false,
-          "Unexpected pixel format: ",
-          FFMPEG av_get_pix_fmt_name(pix_fmt));
+          false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt));
   }
 }
 
@@ -385,9 +383,7 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size)
       break;
     default:
       TORCH_INTERNAL_ASSERT(
-          false,
-          "Unsupported media type: ",
-          FFMPEG av_get_media_type_string(type));
+          false, "Unsupported media type: ", av_get_media_type_string(type));
   }
 }
 
diff --git a/torchaudio/csrc/ffmpeg/stub.cpp b/torchaudio/csrc/ffmpeg/stub.cpp
deleted file mode 100644
index 4960b0050e..0000000000
--- a/torchaudio/csrc/ffmpeg/stub.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifdef DLOPEN_FFMPEG
-
-#include <ATen/DynamicLibrary.h>
-#include <c10/util/CallOnce.h>
-#include <torchaudio/csrc/ffmpeg/stub.h>
-
-extern "C" {
-#include <libavcodec/version.h>
-#include <libavdevice/version.h>
-#include <libavfilter/version.h>
-#include <libavformat/version.h>
-#include <libavutil/version.h>
-}
-
-namespace torchaudio::io::detail {
-namespace {
-class StubImpl {
-  at::DynamicLibrary libavutil;
-  at::DynamicLibrary libavcodec;
-  at::DynamicLibrary libavformat;
-  at::DynamicLibrary libavdevice;
-  at::DynamicLibrary libavfilter;
-
- public:
-  // The struct that holds all the function pointers to be used.
-  FFmpegStub stub{};
-
-  StubImpl(
-      const char* util,
-      const char* codec,
-      const char* format,
-      const char* device,
-      const char* filter)
-      : libavutil(util),
-        libavcodec(codec),
-        libavformat(format),
-        libavdevice(device),
-        libavfilter(filter) {
-#define set(X) stub.X = (decltype(FFmpegStub::X))libavutil.sym(#X)
-    set(av_buffer_ref);
-    set(av_buffer_unref);
-    set(av_d2q);
-    set(av_dict_free);
-    set(av_dict_get);
-    set(av_dict_set);
-    set(av_frame_alloc);
-    set(av_frame_free);
-    set(av_frame_get_buffer);
-    set(av_frame_is_writable);
-    set(av_frame_make_writable);
-    set(av_frame_unref);
-    set(av_freep);
-    set(av_get_channel_layout_nb_channels);
-    set(av_get_channel_name);
-    set(av_get_default_channel_layout);
-    set(av_get_media_type_string);
-    set(av_get_pix_fmt);
-    set(av_get_pix_fmt_name);
-    set(av_get_sample_fmt);
-    set(av_get_sample_fmt_name);
-    set(av_get_time_base_q);
-    set(av_hwdevice_ctx_create);
-    set(av_hwframe_ctx_alloc);
-    set(av_hwframe_ctx_init);
-    set(av_hwframe_get_buffer);
-    set(av_log_get_level);
-    set(av_log_set_level);
-    set(av_malloc);
-    set(av_pix_fmt_desc_get);
-    set(av_rescale_q);
-    set(av_sample_fmt_is_planar);
-    set(av_strdup);
-    set(av_strerror);
-    set(avutil_version);
-#undef set
-
-#define set(X) stub.X = (decltype(FFmpegStub::X))libavcodec.sym(#X)
-    set(av_codec_is_decoder);
-    set(av_codec_is_encoder);
-    set(av_codec_iterate);
-    set(av_packet_alloc);
-    set(av_packet_clone);
-    set(av_packet_free);
-    set(av_packet_ref);
-    set(av_packet_rescale_ts);
-    set(av_packet_unref);
-    set(avcodec_alloc_context3);
-    set(avcodec_configuration);
-    set(avcodec_descriptor_get);
-    set(avcodec_find_decoder);
-    set(avcodec_find_decoder_by_name);
-    set(avcodec_find_encoder);
-    set(avcodec_find_encoder_by_name);
-    set(avcodec_flush_buffers);
-    set(avcodec_free_context);
-    set(avcodec_get_hw_config);
-    set(avcodec_get_name);
-    set(avcodec_open2);
-    set(avcodec_parameters_alloc);
-    set(avcodec_parameters_copy);
-    set(avcodec_parameters_free);
-    set(avcodec_parameters_from_context);
-    set(avcodec_parameters_to_context);
-    set(avcodec_receive_frame);
-    set(avcodec_receive_packet);
-    set(avcodec_send_frame);
-    set(avcodec_send_packet);
-    set(avcodec_version);
-#undef set
-
-#define set(X) stub.X = (decltype(FFmpegStub::X))libavformat.sym(#X)
-    set(av_demuxer_iterate);
-    set(av_dump_format);
-    set(av_find_best_stream);
-    set(av_find_input_format);
-    set(av_guess_frame_rate);
-    set(av_interleaved_write_frame);
-    set(av_muxer_iterate);
-    set(av_read_frame);
-    set(av_seek_frame);
-    set(av_write_trailer);
-    set(avio_alloc_context);
-    set(avio_enum_protocols);
-    set(avio_closep);
-    set(avio_flush);
-    set(avio_open2);
-    set(avformat_alloc_context);
-    set(avformat_alloc_output_context2);
-    set(avformat_close_input);
-    set(avformat_find_stream_info);
-    set(avformat_free_context);
-    set(avformat_new_stream);
-    set(avformat_open_input);
-    set(avformat_version);
-    set(avformat_write_header);
-#undef set
-
-#define set(X) stub.X = (decltype(FFmpegStub::X))libavdevice.sym(#X)
-    set(avdevice_register_all);
-    set(avdevice_version);
-#undef set
-
-#define set(X) stub.X = (decltype(FFmpegStub::X))libavfilter.sym(#X)
-    set(av_buffersink_get_frame);
-    set(av_buffersrc_add_frame_flags);
-    set(avfilter_get_by_name);
-    set(avfilter_graph_alloc);
-    set(avfilter_graph_config);
-    set(avfilter_graph_create_filter);
-    set(avfilter_graph_free);
-    set(avfilter_graph_parse_ptr);
-    set(avfilter_inout_alloc);
-    set(avfilter_inout_free);
-    set(avfilter_version);
-#undef set
-  }
-};
-
-static std::unique_ptr<StubImpl> _stub;
-
-void _init_stub() {
-#if defined(_WIN32)
-  _stub = std::make_unique<StubImpl>(
-      "avutil-" AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dll",
-      "avcodec-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dll",
-      "avformat-" AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dll",
-      "avdevice-" AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dll",
-      "avfilter-" AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dll");
-#elif defined(__APPLE__)
-  _stub = std::make_unique<StubImpl>(
-      "libavutil." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dylib",
-      "libavcodec." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dylib",
-      "libavformat." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dylib",
-      "libavdevice." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dylib",
-      "libavfilter." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dylib");
-#else
-  _stub = std::make_unique<StubImpl>(
-      "libavutil.so." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR),
-      "libavcodec.so." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR),
-      "libavformat.so." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR),
-      "libavdevice.so." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR),
-      "libavfilter.so." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR));
-#endif
-}
-
-} // namespace
-
-FFmpegStub& ffmpeg_stub() {
-  static c10::once_flag init_flag;
-  c10::call_once(init_flag, _init_stub);
-  return _stub->stub;
-}
-
-} // namespace torchaudio::io::detail
-
-#endif
diff --git a/torchaudio/csrc/ffmpeg/stub.h b/torchaudio/csrc/ffmpeg/stub.h
deleted file mode 100644
index ae6e0a3d1c..0000000000
--- a/torchaudio/csrc/ffmpeg/stub.h
+++ /dev/null
@@ -1,313 +0,0 @@
-#pragma once
-
-// Abstraction of the access to FFmpeg libraries.
-//
-// Do not include this in header files.
-// Include this header in implementation files and prepend
-// all the calls to libav functions with FFMPEG macro.
-//
-// If DLOPEN_FFMPEG is not defined, FFMPEG macro is empty.
-// In this case, FFmpeg libraries are linked at the time torchaudio is built.
-//
-// If DLOPEN_FFMPEG is defined, FFMPEG macro becomes a function call to
-// fetch a stub instance of FFmpeg libraries.
-// This function also initializes the function pointers by automatically
-// dlopens all the required libraries.
-//
-
-#ifndef DLOPEN_FFMPEG
-#define FFMPEG
-#else
-#define FFMPEG detail::ffmpeg_stub().
-
-#include <torchaudio/csrc/ffmpeg/ffmpeg.h>
-
-namespace torchaudio::io::detail {
-
-struct FFmpegStub;
-
-// dlopen FFmpeg libraries and populate the methods of stub instance,
-// then return the reference to the stub instance
-FFmpegStub& ffmpeg_stub();
-
-struct FFmpegStub {
-  /////////////////////////////////////////////////////////////////////////////
-  // libavutil
-  /////////////////////////////////////////////////////////////////////////////
-
-  AVBufferRef* (*av_buffer_ref)(const AVBufferRef*);
-
-  void (*av_buffer_unref)(AVBufferRef**);
-
-  AVRational (*av_d2q)(double, int) av_const;
-
-  void (*av_dict_free)(AVDictionary**);
-
-  AVDictionaryEntry* (*av_dict_get)(
-      const AVDictionary*,
-      const char*,
-      const AVDictionaryEntry*,
-      int);
-
-  int (*av_dict_set)(AVDictionary**, const char*, const char*, int);
-
-  AVFrame* (*av_frame_alloc)();
-
-  void (*av_frame_free)(AVFrame**);
-
-  int (*av_frame_get_buffer)(AVFrame*, int);
-
-  int (*av_frame_is_writable)(AVFrame*);
-
-  int (*av_frame_make_writable)(AVFrame*);
-
-  void (*av_frame_unref)(AVFrame*);
-
-  void (*av_freep)(void*);
-
-  int (*av_get_channel_layout_nb_channels)(uint64_t);
-
-  const char* (*av_get_channel_name)(uint64_t);
-
-  int64_t (*av_get_default_channel_layout)(int);
-
-  const char* (*av_get_media_type_string)(enum AVMediaType);
-
-  enum AVPixelFormat (*av_get_pix_fmt)(const char*);
-
-  const char* (*av_get_pix_fmt_name)(enum AVPixelFormat);
-
-  enum AVSampleFormat (*av_get_sample_fmt)(const char*);
-
-  const char* (*av_get_sample_fmt_name)(enum AVSampleFormat);
-
-  AVRational (*av_get_time_base_q)();
-
-  int (*av_hwdevice_ctx_create)(
-      AVBufferRef**,
-      enum AVHWDeviceType,
-      const char*,
-      AVDictionary*,
-      int);
-
-  AVBufferRef* (*av_hwframe_ctx_alloc)(AVBufferRef*);
-
-  int (*av_hwframe_ctx_init)(AVBufferRef*);
-
-  int (*av_hwframe_get_buffer)(AVBufferRef*, AVFrame*, int);
-
-  int (*av_log_get_level)();
-
-  void (*av_log_set_level)(int);
-
-  void* (*av_malloc)(size_t);
-
-  const AVPixFmtDescriptor* (*av_pix_fmt_desc_get)(enum AVPixelFormat);
-
-  int64_t (*av_rescale_q)(int64_t, AVRational, AVRational) av_const;
-
-  int (*av_sample_fmt_is_planar)(enum AVSampleFormat);
-
-  char* (*av_strdup)(const char*);
-
-  int (*av_strerror)(int, char*, size_t);
-
-  unsigned (*avutil_version)();
-
-  /////////////////////////////////////////////////////////////////////////////
-  // libavcodec
-  /////////////////////////////////////////////////////////////////////////////
-
-  int (*av_codec_is_decoder)(const AVCodec*);
-
-  int (*av_codec_is_encoder)(const AVCodec*);
-
-  const AVCodec* (*av_codec_iterate)(void**);
-
-  AVPacket* (*av_packet_alloc)();
-
-  AVPacket* (*av_packet_clone)(const AVPacket*);
-
-  void (*av_packet_free)(AVPacket**);
-
-  int (*av_packet_ref)(AVPacket*, const AVPacket*);
-
-  void (*av_packet_rescale_ts)(AVPacket*, AVRational, AVRational);
-
-  void (*av_packet_unref)(AVPacket*);
-
-  AVCodecContext* (*avcodec_alloc_context3)(const AVCodec*);
-
-  const char* (*avcodec_configuration)();
-
-  const AVCodecDescriptor* (*avcodec_descriptor_get)(enum AVCodecID);
-
-  AVCodec* (*avcodec_find_decoder)(enum AVCodecID);
-
-  AVCodec* (*avcodec_find_decoder_by_name)(const char*);
-
-  AVCodec* (*avcodec_find_encoder)(enum AVCodecID);
-
-  AVCodec* (*avcodec_find_encoder_by_name)(const char*);
-
-  void (*avcodec_flush_buffers)(AVCodecContext*);
-
-  void (*avcodec_free_context)(AVCodecContext**);
-
-  const AVCodecHWConfig* (*avcodec_get_hw_config)(const AVCodec*, int);
-
-  const char* (*avcodec_get_name)(enum AVCodecID);
-
-  int (*avcodec_open2)(AVCodecContext*, const AVCodec*, AVDictionary**);
-
-  AVCodecParameters* (*avcodec_parameters_alloc)();
-
-  int (*avcodec_parameters_copy)(AVCodecParameters*, const AVCodecParameters*);
-
-  void (*avcodec_parameters_free)(AVCodecParameters**);
-
-  int (*avcodec_parameters_from_context)(
-      AVCodecParameters*,
-      const AVCodecContext*);
-
-  int (*avcodec_parameters_to_context)(
-      AVCodecContext*,
-      const AVCodecParameters*);
-
-  int (*avcodec_receive_frame)(AVCodecContext*, AVFrame*);
-
-  int (*avcodec_receive_packet)(AVCodecContext*, AVPacket*);
-
-  int (*avcodec_send_frame)(AVCodecContext*, const AVFrame*);
-
-  int (*avcodec_send_packet)(AVCodecContext*, const AVPacket*);
-
-  unsigned (*avcodec_version)();
-
-  /////////////////////////////////////////////////////////////////////////////
-  // libavformat
-  /////////////////////////////////////////////////////////////////////////////
-
-  const AVInputFormat* (*av_demuxer_iterate)(void**);
-
-  void (*av_dump_format)(AVFormatContext*, int, const char*, int);
-
-  int (*av_find_best_stream)(
-      AVFormatContext*,
-      enum AVMediaType,
-      int,
-      int,
-      AVCodec**,
-      int);
-
-  AVInputFormat* (*av_find_input_format)(const char*);
-
-  AVRational (*av_guess_frame_rate)(AVFormatContext*, AVStream*, AVFrame*);
-
-  int (*av_interleaved_write_frame)(AVFormatContext*, AVPacket*);
-
-  const AVOutputFormat* (*av_muxer_iterate)(void**);
-
-  int (*av_read_frame)(AVFormatContext*, AVPacket*);
-
-  int (*av_seek_frame)(AVFormatContext*, int, int64_t, int);
-
-  int (*av_write_trailer)(AVFormatContext* s);
-
-  AVIOContext* (*avio_alloc_context)(
-      unsigned char*,
-      int,
-      int,
-      void*,
-      int (*)(void*, uint8_t*, int),
-      int (*)(void*, uint8_t*, int),
-      int64_t (*)(void*, int64_t, int));
-
-  const char* (*avio_enum_protocols)(void**, int);
-
-  int (*avio_closep)(AVIOContext**);
-
-  void (*avio_flush)(AVIOContext*);
-
-  int (*avio_open2)(
-      AVIOContext**,
-      const char*,
-      int,
-      const AVIOInterruptCB*,
-      AVDictionary**);
-
-  AVFormatContext* (*avformat_alloc_context)();
-
-  int (*avformat_alloc_output_context2)(
-      AVFormatContext**,
-      AVOutputFormat*,
-      const char*,
-      const char*);
-
-  void (*avformat_close_input)(AVFormatContext**);
-
-  int (*avformat_find_stream_info)(AVFormatContext*, AVDictionary**);
-
-  void (*avformat_free_context)(AVFormatContext*);
-
-  AVStream* (*avformat_new_stream)(AVFormatContext*, const AVCodec*);
-
-  int (*avformat_open_input)(
-      AVFormatContext**,
-      const char*,
-      AVFORMAT_CONST AVInputFormat*,
-      AVDictionary**);
-
-  unsigned (*avformat_version)();
-
-  int (*avformat_write_header)(AVFormatContext*, AVDictionary**);
-
-  /////////////////////////////////////////////////////////////////////////////
-  // libavdevice
-  /////////////////////////////////////////////////////////////////////////////
-
-  void (*avdevice_register_all)();
-
-  unsigned (*avdevice_version)();
-
-  /////////////////////////////////////////////////////////////////////////////
-  // libavfilter
-  /////////////////////////////////////////////////////////////////////////////
-
-  int (*av_buffersink_get_frame)(AVFilterContext*, AVFrame*);
-
-  int (*av_buffersrc_add_frame_flags)(AVFilterContext*, AVFrame*, int);
-
-  const AVFilter* (*avfilter_get_by_name)(const char*);
-
-  AVFilterGraph* (*avfilter_graph_alloc)();
-
-  int (*avfilter_graph_config)(AVFilterGraph*, void*);
-
-  int (*avfilter_graph_create_filter)(
-      AVFilterContext**,
-      const AVFilter*,
-      const char*,
-      const char*,
-      void*,
-      AVFilterGraph*);
-
-  void (*avfilter_graph_free)(AVFilterGraph**);
-
-  int (*avfilter_graph_parse_ptr)(
-      AVFilterGraph*,
-      const char*,
-      AVFilterInOut**,
-      AVFilterInOut**,
-      void*);
-
-  AVFilterInOut* (*avfilter_inout_alloc)();
-
-  void (*avfilter_inout_free)(AVFilterInOut**);
-
-  unsigned (*avfilter_version)();
-};
-
-} // namespace torchaudio::io::detail
-
-#endif
diff --git a/torchaudio/csrc/forced_align/cpu/compute.cpp b/torchaudio/csrc/forced_align/cpu/compute.cpp
index da42cf942c..d9f735af47 100644
--- a/torchaudio/csrc/forced_align/cpu/compute.cpp
+++ b/torchaudio/csrc/forced_align/cpu/compute.cpp
@@ -17,8 +17,10 @@ void forced_align_impl(
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   using target_t = typename std::
       conditional<target_scalar_type == torch::kInt, int, int64_t>::type;
-  const auto T = logProbs.size(0);
-  const auto L = targets.size(0);
+  const auto batchIndex =
+      0; // TODO: support batch version and use the real batch index
+  const auto T = logProbs.size(1);
+  const auto L = targets.size(1);
   const auto S = 2 * L + 1;
   torch::Tensor alphas = torch::empty(
                              {2, S},
@@ -27,14 +29,14 @@ void forced_align_impl(
                                  .dtype(logProbs.dtype()))
                              .fill_(kNegInfinity);
   torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1);
-  auto logProbs_a = logProbs.accessor<scalar_t, 2>();
-  auto targets_a = targets.accessor<target_t, 1>();
-  auto paths_a = paths.accessor<target_t, 1>();
+  auto logProbs_a = logProbs.accessor<scalar_t, 3>();
+  auto targets_a = targets.accessor<target_t, 2>();
+  auto paths_a = paths.accessor<target_t, 2>();
   auto alphas_a = alphas.accessor<scalar_t, 2>();
   auto backPtr_a = backPtr.accessor<int8_t, 2>();
   auto R = 0;
   for (auto i = 1; i < L; i++) {
-    if (targets_a[i] == targets_a[i - 1]) {
+    if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) {
       ++R;
     }
   }
@@ -49,20 +51,22 @@ void forced_align_impl(
   auto start = T - (L + R) > 0 ? 0 : 1;
   auto end = (S == 1) ? 1 : 2;
   for (auto i = start; i < end; i++) {
-    auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2];
-    alphas_a[0][i] = logProbs_a[0][labelIdx];
+    auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
+    alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx];
   }
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
       if ((start % 2 == 1) &&
-          targets_a[start / 2] != targets_a[start / 2 + 1]) {
+          targets_a[batchIndex][start / 2] !=
+              targets_a[batchIndex][start / 2 + 1]) {
         start = start + 1;
       }
       start = start + 1;
     }
     if (t <= L + R) {
       if (end % 2 == 0 && end < 2 * L &&
-          targets_a[end / 2 - 1] != targets_a[end / 2]) {
+          targets_a[batchIndex][end / 2 - 1] !=
+              targets_a[batchIndex][end / 2]) {
         end = end + 1;
       }
       end = end + 1;
@@ -75,7 +79,7 @@ void forced_align_impl(
     }
     if (start == 0) {
       alphas_a[curIdxOffset][0] =
-          alphas_a[prevIdxOffset][0] + logProbs_a[t][blank];
+          alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
       backPtr_a[t][0] = 0;
       startloop += 1;
     }
@@ -85,13 +89,14 @@ void forced_align_impl(
       auto x1 = alphas_a[prevIdxOffset][i - 1];
       auto x2 = -std::numeric_limits<scalar_t>::infinity();
 
-      auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2];
+      auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
 
       // In CTC, the optimal path may optionally chose to skip a blank label.
       // x2 represents skipping a letter, and can only happen if we're not
       // currently on a blank_label, and we're not on a repeat letter
       // (i != 1) just ensures we don't access targets[i - 2] if its i < 2
-      if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) {
+      if (i % 2 != 0 && i != 1 &&
+          targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) {
         x2 = alphas_a[prevIdxOffset][i - 2];
       }
       scalar_t result = 0.0;
@@ -105,7 +110,7 @@ void forced_align_impl(
         result = x0;
         backPtr_a[t][i] = 0;
       }
-      alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx];
+      alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     }
   }
   auto idx1 = (T - 1) % 2;
@@ -113,8 +118,8 @@ void forced_align_impl(
   // path stores the token index for each time step after force alignment.
   auto indexScores = 0;
   for (auto t = T - 1; t > -1; t--) {
-    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[ltrIdx / 2];
-    paths_a[t] = lbl_idx;
+    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
+    paths_a[batchIndex][t] = lbl_idx;
     ++indexScores;
     ltrIdx -= backPtr_a[t][ltrIdx];
   }
@@ -142,30 +147,35 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
   TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous");
   TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
   TORCH_CHECK(
-      logProbs.dim() != 3,
-      "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.")
+      logProbs.dim() == 3,
+      "log_probs must be 3-D (batch_size, input length, num classes)");
   TORCH_CHECK(
-      targets.dim() != 2,
-      "2-D tensor is not yet supported for targets, please provide 1-D tensor.")
+      targets.dim() == 2, "targets must be 2-D (batch_size, target length,)");
   TORCH_CHECK(
-      logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)");
-  TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)");
-  TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D");
-  TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D");
+      inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)");
+  TORCH_CHECK(
+      targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)");
+  TORCH_CHECK(
+      logProbs.size(0) == 1,
+      "The batch dimension for log_probs must be 1 at the current version.")
+  TORCH_CHECK(
+      targets.size(0) == 1,
+      "The batch dimension for targets must be 1 at the current version.")
   TORCH_CHECK(
       blank >= 0 && blank < logProbs.size(-1),
       "blank must be within [0, num classes)");
 
   TORCH_CHECK(
-      logProbs.size(0) == at::max(inputLengths).item().toInt(),
+      logProbs.size(1) == at::max(inputLengths).item().toInt(),
       "input length mismatch");
   TORCH_CHECK(
-      targets.size(0) == at::max(targetLengths).item().toInt(),
+      targets.size(1) == at::max(targetLengths).item().toInt(),
       "target length mismatch");
 
-  const auto T = logProbs.size(0);
+  const auto B = logProbs.size(0);
+  const auto T = logProbs.size(1);
   auto paths = torch::zeros(
-      {T},
+      {B, T},
       torch::TensorOptions().device(targets.device()).dtype(targets.dtype()));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       logProbs.scalar_type(), "forced_align_impl", [&] {
@@ -180,9 +190,10 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
   return std::make_tuple(
       paths,
       logProbs.index(
-          {torch::linspace(
+          {torch::indexing::Slice(),
+           torch::linspace(
                0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())),
-           paths}));
+           paths.index({0})}));
 }
 
 TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
diff --git a/torchaudio/csrc/forced_align/gpu/compute.cu b/torchaudio/csrc/forced_align/gpu/compute.cu
index d869473831..b23d52f1f3 100644
--- a/torchaudio/csrc/forced_align/gpu/compute.cu
+++ b/torchaudio/csrc/forced_align/gpu/compute.cu
@@ -18,9 +18,9 @@ namespace alignment {
 namespace gpu {
 template <typename scalar_t, typename target_t>
 __global__ void falign_cuda_step_kernel(
-    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits>
+    const torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits>
         logProbs_a,
-    const torch::PackedTensorAccessor32<target_t, 1, torch::RestrictPtrTraits>
+    const torch::PackedTensorAccessor32<target_t, 2, torch::RestrictPtrTraits>
         targets_a,
     const int T,
     const int L,
@@ -36,6 +36,8 @@ __global__ void falign_cuda_step_kernel(
     torch::PackedTensorAccessor32<int8_t, 2, torch::RestrictPtrTraits>
         backPtrBuffer_a) {
   scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
+  const int batchIndex =
+      0; // TODO: support batch version and use the real batch index
   int S = 2 * L + 1;
   int curIdxOffset = (t % 2); // current time step frame for alpha
   int prevIdxOffset = ((t - 1) % 2); // previous time step frame for alpha
@@ -49,8 +51,8 @@ __global__ void falign_cuda_step_kernel(
   __syncthreads();
   if (t == 0) {
     for (unsigned int i = start + threadIdx.x; i < end; i += blockDim.x) {
-      int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2];
-      alphas_a[curIdxOffset][i] = logProbs_a[0][labelIdx];
+      int labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
+      alphas_a[curIdxOffset][i] = logProbs_a[batchIndex][0][labelIdx];
     }
     return;
   }
@@ -62,7 +64,7 @@ __global__ void falign_cuda_step_kernel(
   threadMax = kNegInfinity;
   if (start == 0 && threadIdx.x == 0) {
     alphas_a[curIdxOffset][0] =
-        alphas_a[prevIdxOffset][0] + logProbs_a[t][blank];
+        alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
     threadMax = max(threadMax, alphas_a[curIdxOffset][0]);
     backPtrBuffer_a[backPtrBufferLen][0] = 0;
   }
@@ -73,8 +75,9 @@ __global__ void falign_cuda_step_kernel(
     scalar_t x0 = alphas_a[prevIdxOffset][i];
     scalar_t x1 = alphas_a[prevIdxOffset][i - 1];
     scalar_t x2 = kNegInfinity;
-    int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2];
-    if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) {
+    int labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
+    if (i % 2 != 0 && i != 1 &&
+        targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) {
       x2 = alphas_a[prevIdxOffset][i - 2];
     }
     scalar_t result = 0.0;
@@ -88,7 +91,7 @@ __global__ void falign_cuda_step_kernel(
       result = x0;
       backPtrBuffer_a[backPtrBufferLen][i] = 0;
     }
-    alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx];
+    alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
     threadMax = max(threadMax, alphas_a[curIdxOffset][i]);
   }
   scalar_t maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max());
@@ -113,10 +116,12 @@ void forced_align_impl(
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   using target_t = typename std::
       conditional<target_scalar_type == torch::kInt, int, int64_t>::type;
-  auto paths_a = paths.accessor<target_t, 1>();
-  const int T = logProbs.size(0); // num frames
-  const int N = logProbs.size(1); // alphabet size
-  const int L = targets.size(0); // label length
+  auto paths_a = paths.accessor<target_t, 2>();
+  const int batchIndex =
+      0; // TODO: support batch version and use the real batch index
+  const int T = logProbs.size(1); // num frames
+  const int N = logProbs.size(2); // alphabet size
+  const int L = targets.size(1); // label length
   const int S = 2 * L + 1;
   auto targetsCpu = targets.to(torch::kCPU);
   // backPtrBuffer stores the index offset fthe best path at current position
@@ -144,12 +149,12 @@ void forced_align_impl(
                                  .device(logProbs.device()))
                              .fill_(kNegInfinity);
   // CPU accessors
-  auto targetsCpu_a = targetsCpu.accessor<target_t, 1>();
+  auto targetsCpu_a = targetsCpu.accessor<target_t, 2>();
   auto backPtrCpu_a = backPtrCpu.accessor<int8_t, 2>();
   // count the number of repeats in label
   int R = 0;
   for (int i = 1; i < L; ++i) {
-    if (targetsCpu_a[i] == targetsCpu_a[i - 1]) {
+    if (targetsCpu_a[batchIndex][i] == targetsCpu_a[batchIndex][i - 1]) {
       ++R;
     }
   }
@@ -169,14 +174,16 @@ void forced_align_impl(
     if (t > 0) {
       if (T - t <= L + R) {
         if ((start % 2 == 1) &&
-            (targetsCpu_a[start / 2] != targetsCpu_a[start / 2 + 1])) {
+            (targetsCpu_a[batchIndex][start / 2] !=
+             targetsCpu_a[batchIndex][start / 2 + 1])) {
           start = start + 1;
         }
         start = start + 1;
       }
       if (t <= L + R) {
         if ((end % 2 == 0) && (end < 2 * L) &&
-            (targetsCpu_a[end / 2 - 1] != targetsCpu_a[end / 2])) {
+            (targetsCpu_a[batchIndex][end / 2 - 1] !=
+             targetsCpu_a[batchIndex][end / 2])) {
           end = end + 1;
         }
         end = end + 1;
@@ -184,8 +191,8 @@ void forced_align_impl(
     }
     falign_cuda_step_kernel<scalar_t, target_t>
         <<<1, kNumThreads, 0, defaultStream>>>(
-            logProbs.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
-            targets.packed_accessor32<target_t, 1, torch::RestrictPtrTraits>(),
+            logProbs.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
+            targets.packed_accessor32<target_t, 2, torch::RestrictPtrTraits>(),
             T,
             L,
             N,
@@ -229,8 +236,9 @@ void forced_align_impl(
       : S - 2;
   int indexScores = 0;
   for (int t = T - 1; t >= 0; --t) {
-    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targetsCpu_a[ltrIdx / 2];
-    paths_a[t] = lbl_idx;
+    auto lbl_idx =
+        ltrIdx % 2 == 0 ? blank : targetsCpu_a[batchIndex][ltrIdx / 2];
+    paths_a[batchIndex][t] = lbl_idx;
     ++indexScores;
     ltrIdx -= backPtrCpu_a[t][ltrIdx];
   }
@@ -258,30 +266,36 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
   TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous");
   TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
   TORCH_CHECK(
-      logProbs.dim() != 3,
-      "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.")
+      logProbs.dim() == 3,
+      "log_probs must be 3-D (batch_size, input length, num classes)");
   TORCH_CHECK(
-      targets.dim() != 2,
-      "2-D tensor is not yet supported for targets, please provide 1-D tensor.")
+      targets.dim() == 2, "targets must be 2-D (batch_size, target length,)");
   TORCH_CHECK(
-      logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)");
-  TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)");
-  TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D");
-  TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D");
+      inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)");
+  TORCH_CHECK(
+      targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)");
+  TORCH_CHECK(
+      logProbs.size(0) == 1,
+      "The batch dimension for log_probs must be 1 at the current version.")
+  TORCH_CHECK(
+      targets.size(0) == 1,
+      "The batch dimension for targets must be 1 at the current version.")
   TORCH_CHECK(
       blank >= 0 && blank < logProbs.size(-1),
       "blank must be within [0, num classes)");
 
   TORCH_CHECK(
-      logProbs.size(0) == at::max(inputLengths).item().toInt(),
+      logProbs.size(1) == at::max(inputLengths).item().toInt(),
       "input length mismatch");
   TORCH_CHECK(
-      targets.size(0) == at::max(targetLengths).item().toInt(),
+      targets.size(1) == at::max(targetLengths).item().toInt(),
       "target length mismatch");
 
-  auto T = logProbs.size(0); // num frames
+  auto B = logProbs.size(0);
+  auto T = logProbs.size(1); // num frames
   auto paths = torch::zeros(
-      {T}, torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype()));
+      {B, T},
+      torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype()));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       logProbs.scalar_type(), "forced_align_impl", [&] {
         if (targets.scalar_type() == torch::kInt64) {
@@ -295,9 +309,10 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
   return std::make_tuple(
       paths.to(logProbs.device()),
       logProbs.index(
-          {torch::linspace(
+          {torch::indexing::Slice(),
+           torch::linspace(
                0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())),
-           paths}));
+           paths.index({0})}));
 }
 
 TORCH_LIBRARY_IMPL(torchaudio, CUDA, m) {
diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index e6457d299c..8b732cf663 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -2511,12 +2511,12 @@ def forced_align(
 
     Args:
         log_probs (torch.Tensor): log probability of CTC emission output.
-            Tensor of shape `(T, C)`. where `T` is the input length,
+            Tensor of shape `(B, T, C)`, where `B` is the batch size, `T` is the input length,
             `C` is the number of characters in alphabet including blank.
-        targets (torch.Tensor): Target sequence. Tensor of shape `(L,)`,
+        targets (torch.Tensor): Target sequence. Tensor of shape `(B, L)`,
             where `L` is the target length.
-        input_lengths (torch.Tensor): Lengths of the inputs (max value must each be <= `T`). 0-D Tensor (scalar).
-        target_lengths (torch.Tensor): Lengths of the targets. 0-D Tensor (scalar).
+        input_lengths (torch.Tensor): Lengths of the inputs (each value must be <= `T`). 1-D Tensor of shape `(B,)`.
+        target_lengths (torch.Tensor): Lengths of the targets. 1-D Tensor of shape `(B,)`.
         blank_id (int, optional): The index of blank symbol in CTC emission. (Default: 0)
 
     Returns:
@@ -2534,6 +2534,9 @@ def forced_align(
 
         where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens.
         For example, in str `"aabbc"`, the number of repeats are `2`.
+
+    Note:
+        The current version only supports ``batch_size == 1``.
     """
     if blank in targets:
         raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")