diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 16eb2a4b65..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,473 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.8 -# - Replace binary_linux_wheel_py3.8 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - generate_cache_key: - description: "Generates a cache key file that changes daily" - steps: - - run: - name: Generate cache key - command: echo "$(date +"%Y-%m-%d")" > .cachekey - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]] || [[ ${CIRCLE_BRANCH} =~ release/* ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - load_conda_channel_flags: - description: "Determines whether we need extra conda channels" - steps: - - run: - name: Adding CONDA_CHANNEL_FLAGS to BASH_ENV - command: | - CONDA_CHANNEL_FLAGS="" - # formerly used to add conda-forge flags for Python 3.9, reserving the mechanism for future python upgrades - windows_install_cuda: - description: "Install desired CUDA version on Windows runners" - steps: - - run: - name: Install CUDA - command: | - packaging/windows/internal/cuda_install.bat - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.8)" - type: string - cuda_version: - description: "CUDA version to build against (e.g., cpu, cu101)" - type: string - default: "cpu" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda116" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cuda116" - environment: &environment - PYTHON_VERSION: << parameters.python_version >> - BUILD_VERSION: << parameters.build_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - CU_VERSION: << parameters.cuda_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: pytorch/torchaudio_unittest_base:smoke_test-20220425 - resource_class: large - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.8 - steps: - - checkout - - run: - command: | - pip install --user --progress-bar off jinja2 pyyaml - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: circleci/python:3.8 - steps: - - checkout - - run: - name: Install pre-commit - command: pip install --user --progress-bar off pre-commit - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: always - command: git --no-pager diff --color=always - - download_third_parties: - docker: - - image: "pytorch/torchaudio_unittest_base:manylinux" - resource_class: small - steps: - - checkout - - generate_cache_key - - restore_cache: - - keys: - - tp-nix-v2-{{ checksum ".cachekey" }} - - - run: - command: | - mkdir -p third_party/archives/ - wget --no-clobber --directory-prefix=third_party/archives/ $(awk '/URL /{print $2}' third_party/*/CMakeLists.txt) - - save_cache: - - key: tp-nix-v2-{{ checksum ".cachekey" }} - - paths: - - third_party/archives - - persist_to_workspace: - root: third_party - paths: - - archives - - build_ffmpeg_linux: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout - - generate_cache_key - - restore_cache: - - keys: - - ffmpeg-linux-v0-{{ checksum ".cachekey" }} - - - run: - command: | - export FFMPEG_ROOT=${PWD}/third_party/ffmpeg - if [[ ! -d ${FFMPEG_ROOT} ]]; then - packaging/ffmpeg/build.sh - fi - - save_cache: - - key: ffmpeg-linux-v0-{{ checksum ".cachekey" }} - - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - build_ffmpeg_macos: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout - - generate_cache_key - - restore_cache: - - keys: - - ffmpeg-macos-v0-{{ checksum ".cachekey" }} - - - run: - command: | - export FFMPEG_ROOT=${PWD}/third_party/ffmpeg - if [[ ! 
-d ${FFMPEG_ROOT} ]]; then - packaging/ffmpeg/build.sh - fi - - save_cache: - - key: ffmpeg-macos-v0-{{ checksum ".cachekey" }} - - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - build_ffmpeg_windows: - <<: *binary_common - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - # Note: - # Unlike other Windows job, this job uses cmd.exe as shell because - # we need to invoke bash.exe from msys2 in ffmpeg build process, and doing so - # from different installation of bash.exe (the one from the VM) cause issue - shell: cmd.exe - steps: - - checkout - - run: date /t > .cachekey - - restore_cache: - - keys: - - ffmpeg-windows-{{ checksum ".cachekey" }} - - - run: packaging\ffmpeg\build.bat - - save_cache: - - key: ffmpeg-windows-{{ checksum ".cachekey" }} - - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: pytorch/torchaudio_unittest_base:manylinux-20210121 - resource_class: 2xlarge+ - steps: - - checkout - - attach_workspace: - at: third_party - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/linux/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/windows/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - <<: *environment - CUDA_VERSION: "11.7" - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - run: - name: Install CUDA - command: 
packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchaudio - command: .circleci/unittest/windows/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - load_conda_channel_flags - - attach_workspace: - at: third_party - - designate_upload_channel - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/linux/scripts/install.sh - environment: - USE_FFMPEG: true - USE_OPENMP: false - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - stylecheck: - <<: *binary_common - docker: - - image: "pytorch/torchaudio_unittest_base:manylinux" - resource_class: medium - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Run style check - command: .circleci/unittest/linux/scripts/run_style_checks.sh - -workflows: - lint: - jobs: - - lint_python_and_config - unittest: - jobs: - - circleci_consistency - - download_third_parties: - name: download_third_parties - - unittest_linux_cpu: - cuda_version: cpu - name: unittest_linux_cpu_py3.8 - python_version: '3.8' - requires: - - download_third_parties - - stylecheck: - cuda_version: cpu - name: stylecheck_py3.8 - python_version: '3.8' - - unittest_linux_cpu: - cuda_version: cpu - name: unittest_linux_cpu_py3.9 - python_version: '3.9' - requires: - - download_third_parties - - unittest_linux_cpu: - cuda_version: cpu - name: unittest_linux_cpu_py3.10 - python_version: '3.10' - requires: - - download_third_parties - - unittest_linux_cpu: - 
cuda_version: cpu - name: unittest_linux_cpu_py3.11 - python_version: '3.11' - requires: - - download_third_parties - - unittest_windows_cpu: - cuda_version: cpu - name: unittest_windows_cpu_py3.8 - python_version: '3.8' - requires: - - download_third_parties - - unittest_windows_gpu: - cuda_version: cu118 - name: unittest_windows_gpu_py3.8 - python_version: '3.8' - requires: - - download_third_parties - - unittest_macos_cpu: - cuda_version: cpu - name: unittest_macos_cpu_py3.8 - python_version: '3.8' - requires: - - download_third_parties diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in deleted file mode 100644 index c8e8f12873..0000000000 --- a/.circleci/config.yml.in +++ /dev/null @@ -1,426 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.8 -# - Replace binary_linux_wheel_py3.8 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - generate_cache_key: - description: "Generates a cache key file that changes daily" - steps: - - run: - name: Generate cache key - command: echo "$(date +"%Y-%m-%d")" > .cachekey - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]] || [[ ${CIRCLE_BRANCH} =~ release/* ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - load_conda_channel_flags: - description: "Determines whether we need extra conda channels" - steps: - - run: - name: Adding CONDA_CHANNEL_FLAGS to BASH_ENV - command: | - CONDA_CHANNEL_FLAGS="" - # formerly used to add conda-forge flags for Python 3.9, reserving the mechanism for future python upgrades - windows_install_cuda: - description: "Install desired CUDA version on Windows runners" - steps: - - run: - name: Install CUDA - command: | - packaging/windows/internal/cuda_install.bat - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.8)" - type: string - cuda_version: - description: "CUDA version to build against (e.g., cpu, cu101)" - type: string - default: "cpu" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda116" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cuda116" - environment: &environment - PYTHON_VERSION: << parameters.python_version >> - BUILD_VERSION: << parameters.build_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - CU_VERSION: << parameters.cuda_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -smoke_test_common: 
&smoke_test_common - <<: *binary_common - docker: - - image: pytorch/torchaudio_unittest_base:smoke_test-20220425 - resource_class: large - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.8 - steps: - - checkout - - run: - command: | - pip install --user --progress-bar off jinja2 pyyaml - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: circleci/python:3.8 - steps: - - checkout - - run: - name: Install pre-commit - command: pip install --user --progress-bar off pre-commit - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: always - command: git --no-pager diff --color=always - - download_third_parties: - docker: - - image: "pytorch/torchaudio_unittest_base:manylinux" - resource_class: small - steps: - - checkout - - generate_cache_key - - restore_cache: - {% raw %} - keys: - - tp-nix-v2-{{ checksum ".cachekey" }} - {% endraw %} - - run: - command: | - mkdir -p third_party/archives/ - wget --no-clobber --directory-prefix=third_party/archives/ $(awk '/URL /{print $2}' third_party/*/CMakeLists.txt) - - save_cache: - {% raw %} - key: tp-nix-v2-{{ checksum ".cachekey" }} - {% endraw %} - paths: - - third_party/archives - - persist_to_workspace: - root: third_party - paths: - - archives - - build_ffmpeg_linux: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout - - generate_cache_key - - restore_cache: - {% raw %} - keys: - - ffmpeg-linux-v0-{{ checksum ".cachekey" }} - {% endraw %} - - run: - command: | - export FFMPEG_ROOT=${PWD}/third_party/ffmpeg - if [[ ! -d ${FFMPEG_ROOT} ]]; then - packaging/ffmpeg/build.sh - fi - - save_cache: - {% raw %} - key: ffmpeg-linux-v0-{{ checksum ".cachekey" }} - {% endraw %} - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - build_ffmpeg_macos: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout - - generate_cache_key - - restore_cache: - {% raw %} - keys: - - ffmpeg-macos-v0-{{ checksum ".cachekey" }} - {% endraw %} - - run: - command: | - export FFMPEG_ROOT=${PWD}/third_party/ffmpeg - if [[ ! 
-d ${FFMPEG_ROOT} ]]; then - packaging/ffmpeg/build.sh - fi - - save_cache: - {% raw %} - key: ffmpeg-macos-v0-{{ checksum ".cachekey" }} - {% endraw %} - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - build_ffmpeg_windows: - <<: *binary_common - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - # Note: - # Unlike other Windows job, this job uses cmd.exe as shell because - # we need to invoke bash.exe from msys2 in ffmpeg build process, and doing so - # from different installation of bash.exe (the one from the VM) cause issue - shell: cmd.exe - steps: - - checkout - - run: date /t > .cachekey - - restore_cache: - {% raw %} - keys: - - ffmpeg-windows-{{ checksum ".cachekey" }} - {% endraw %} - - run: packaging\ffmpeg\build.bat - - save_cache: - {% raw %} - key: ffmpeg-windows-{{ checksum ".cachekey" }} - {% endraw %} - paths: - - third_party/ffmpeg - - persist_to_workspace: - root: third_party - paths: - - ffmpeg - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: pytorch/torchaudio_unittest_base:manylinux-20210121 - resource_class: 2xlarge+ - steps: - - checkout - - attach_workspace: - at: third_party - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/linux/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/windows/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - <<: *environment - CUDA_VERSION: "11.7" - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: 
.circleci/unittest/windows/scripts/setup_env.sh - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchaudio - command: .circleci/unittest/windows/scripts/install.sh - environment: - USE_FFMPEG: true - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - load_conda_channel_flags - - attach_workspace: - at: third_party - - designate_upload_channel - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Install torchaudio - command: .circleci/unittest/linux/scripts/install.sh - environment: - USE_FFMPEG: true - USE_OPENMP: false - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - environment: - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_QUANTIZATION: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece: true - TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE: true - - store_test_results: - path: test-results - - store_artifacts: - path: test/htmlcov - - stylecheck: - <<: *binary_common - docker: - - image: "pytorch/torchaudio_unittest_base:manylinux" - resource_class: medium - steps: - - checkout - - designate_upload_channel - - load_conda_channel_flags - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Run style check - command: .circleci/unittest/linux/scripts/run_style_checks.sh - -workflows: - lint: - jobs: - - lint_python_and_config - unittest: - jobs: - - circleci_consistency - {{ unittest_workflows() }} diff --git a/.circleci/regenerate.py b/.circleci/regenerate.py deleted file mode 100755 index f29ba82e6d..0000000000 --- a/.circleci/regenerate.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 - -""" -This script should use a very simple, functional programming style. -Avoid Jinja macros in favor of native Python functions. 
- -Don't go overboard on code generation; use Python only to generate -content that can't be easily declared statically using CircleCI's YAML API. - -Data declarations (e.g. the nested loops for defining the configuration matrix) -should be at the top of the file for easy updating. - -See this comment for design rationale: -https://github.com/pytorch/vision/pull/1321#issuecomment-531033978 -""" - -import os.path - -import jinja2 -import yaml -from jinja2 import select_autoescape - - -PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"] - - -def build_download_job(filter_branch): - job = { - "name": "download_third_parties", - } - - if filter_branch: - job["filters"] = gen_filter_branch_tree(filter_branch) - return [{"download_third_parties": job}] - - -def build_ffmpeg_job(os_type, filter_branch): - job = { - "name": f"build_ffmpeg_{os_type}", - "requires": ["download_third_parties"], - } - - if filter_branch: - job["filters"] = gen_filter_branch_tree(filter_branch) - job["python_version"] = "foo" - return [{f"build_ffmpeg_{os_type}": job}] - - -def gen_filter_branch_tree(*branches): - return { - "branches": { - "only": list(branches), - }, - "tags": { - # Using a raw string here to avoid having to escape - # anything - "only": r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/" - }, - } - - -def indent(indentation, data_list): - return ("\n" + " " * indentation).join(yaml.dump(data_list).splitlines()) - - -def unittest_python_versions(os): - return { - "windows": PYTHON_VERSIONS[:1], - "macos": PYTHON_VERSIONS[:1], - "linux": PYTHON_VERSIONS, - }.get(os) - - -def unittest_workflows(indentation=6): - jobs = [] - jobs += build_download_job(None) - for os_type in ["linux", "windows", "macos"]: - for device_type in ["cpu", "gpu"]: - if os_type != "windows" and device_type == "gpu": - continue - - for i, python_version in enumerate(unittest_python_versions(os_type)): - job = { - "name": f"unittest_{os_type}_{device_type}_py{python_version}", - "python_version": python_version, - "cuda_version": "cpu" if device_type == "cpu" else "cu118", - "requires": ["download_third_parties"], - } - - jobs.append({f"unittest_{os_type}_{device_type}": job}) - - if i == 0 and os_type == "linux" and device_type == "cpu": - jobs.append( - { - "stylecheck": { - "name": f"stylecheck_py{python_version}", - "python_version": python_version, - "cuda_version": "cpu", - } - } - ) - return indent(indentation, jobs) - - -if __name__ == "__main__": - d = os.path.dirname(__file__) - env = jinja2.Environment( - loader=jinja2.FileSystemLoader(d), - lstrip_blocks=True, - autoescape=select_autoescape(enabled_extensions=("html", "xml")), - ) - - with open(os.path.join(d, "config.yml"), "w") as f: - f.write( - env.get_template("config.yml.in").render( - unittest_workflows=unittest_workflows, - ) - ) - f.write("\n") diff --git a/.circleci/smoke_test/docker/Dockerfile b/.circleci/smoke_test/docker/Dockerfile deleted file mode 100644 index 4d2bed3aef..0000000000 --- a/.circleci/smoke_test/docker/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -# this Dockerfile is for torchaudio smoke test, it will be created periodically via CI system -# if you need to do it locally, follow below steps once you have Docker installed -# assuming you're within the directory where this Dockerfile located -# to test the build use : docker build . 
-t torchaudio/smoketest -# to upload the Dockerfile use build_and_push.sh script - -FROM ubuntu:latest - -RUN apt-get -qq update && apt-get -qq -y install curl bzip2 sox libsox-dev libsox-fmt-all \ - && curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \ - && bash /tmp/miniconda.sh -bfp /usr/local \ - && rm -rf /tmp/miniconda.sh \ - && conda install -c conda-forge gcc \ - && conda install -y python=3 \ - && conda update conda \ - && apt-get -qq -y remove curl bzip2 \ - && apt-get -qq -y autoremove \ - && apt-get autoclean \ - && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log \ - && conda clean --all --yes - -ENV PATH /opt/conda/bin:$PATH - - -RUN conda create -y --name python3.8 python=3.8 -RUN conda create -y --name python3.9 python=3.9 -RUN conda create -y --name python3.10 python=3.10 - -SHELL [ "/bin/bash", "-c" ] -RUN echo "source /usr/local/etc/profile.d/conda.sh" >> ~/.bashrc -RUN source /usr/local/etc/profile.d/conda.sh && conda activate python3.8 && conda install -y -c conda-forge sox && conda install -y numpy -RUN source /usr/local/etc/profile.d/conda.sh && conda activate python3.9 && conda install -y -c conda-forge sox && conda install -y numpy -RUN source /usr/local/etc/profile.d/conda.sh && conda activate python3.10 && conda install -y -c conda-forge sox && conda install -y numpy -CMD [ "/bin/bash"] diff --git a/.circleci/smoke_test/docker/build_and_push.sh b/.circleci/smoke_test/docker/build_and_push.sh deleted file mode 100755 index 092d21de09..0000000000 --- a/.circleci/smoke_test/docker/build_and_push.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -datestr="$(date "+%Y%m%d")" -image="pytorch/torchaudio_unittest_base:smoke_test-${datestr}" -docker build -t "${image}" . -docker push "${image}" diff --git a/.circleci/unittest/linux/README.md b/.circleci/unittest/linux/README.md deleted file mode 100644 index 0a4b0e0e63..0000000000 --- a/.circleci/unittest/linux/README.md +++ /dev/null @@ -1,6 +0,0 @@ -This directory contains; - - - docker - Docker image definition and scripts to build and update Docker image for unittest. - - scripts - Scripts used by CircleCI to run unit tests. 
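For context on the mechanism being removed above: the deleted `.circleci/regenerate.py` produces `config.yml` by rendering `config.yml.in` with Jinja2, exposing `unittest_workflows` to the template as a callable. Below is a condensed sketch of that pattern only, with the job-matrix expansion reduced to a placeholder and the template assumed to sit next to the script; it is not the full deleted script.

```python
# Condensed sketch of the CircleCI config generation removed in this diff:
# config.yml.in is a Jinja2 template that calls {{ unittest_workflows() }},
# and this script renders it into config.yml. The job list below is a
# placeholder; the real script expands PYTHON_VERSIONS across linux/windows/macos.
import os.path

import jinja2
import yaml


def unittest_workflows(indentation=6):
    jobs = [{"download_third_parties": {"name": "download_third_parties"}}]
    # Dump the job list as YAML and re-indent it so it nests under `workflows:` in the template.
    return ("\n" + " " * indentation).join(yaml.dump(jobs).splitlines())


d = os.path.dirname(__file__)
env = jinja2.Environment(loader=jinja2.FileSystemLoader(d), lstrip_blocks=True)
with open(os.path.join(d, "config.yml"), "w") as f:
    f.write(env.get_template("config.yml.in").render(unittest_workflows=unittest_workflows))
    f.write("\n")
```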
diff --git a/.circleci/unittest/linux/docker/.dockerignore b/.circleci/unittest/linux/docker/.dockerignore deleted file mode 100644 index 1398d409f8..0000000000 --- a/.circleci/unittest/linux/docker/.dockerignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!scripts diff --git a/.circleci/unittest/linux/docker/.gitignore b/.circleci/unittest/linux/docker/.gitignore deleted file mode 100644 index 7e977058dd..0000000000 --- a/.circleci/unittest/linux/docker/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -scripts/build_third_parties.sh -Dockerfile.tmp diff --git a/.circleci/unittest/linux/docker/Dockerfile b/.circleci/unittest/linux/docker/Dockerfile deleted file mode 100644 index c47a896348..0000000000 --- a/.circleci/unittest/linux/docker/Dockerfile +++ /dev/null @@ -1,56 +0,0 @@ -FROM ubuntu:18.04 as builder - -RUN apt update -q - -################################################################################ -# Build Kaldi -################################################################################ -RUN apt install -q -y \ - autoconf \ - automake \ - bzip2 \ - g++ \ - gfortran \ - git \ - libatlas-base-dev \ - libtool \ - make \ - python2.7 \ - python3 \ - sox \ - subversion \ - unzip \ - wget \ - zlib1g-dev - -# KALDI uses MKL as a default math library, but we are going to copy featbin binaries and dependent -# shared libraries to the final image, so we use ATLAS, which is easy to reinstall in the final image. -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ - cd /opt/kaldi/tools && \ - make -j $(nproc) && \ - cd /opt/kaldi/src && \ - ./configure --shared --mathlib=ATLAS --use-cuda=no && \ - make featbin -j $(nproc) - -# Copy featbins and dependent libraries -ADD ./scripts /scripts -RUN bash /scripts/copy_kaldi_executables.sh /opt/kaldi /kaldi - -################################################################################ -# Build the final image -################################################################################ -FROM BASE_IMAGE -RUN apt update && apt install -y \ - g++ \ - gfortran \ - git \ - libatlas3-base \ - libsndfile1 \ - wget \ - curl \ - make \ - file \ - pkg-config \ - && rm -rf /var/lib/apt/lists/* -COPY --from=builder /kaldi /kaldi -ENV PATH="${PATH}:/kaldi/bin" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/kaldi/lib" diff --git a/.circleci/unittest/linux/docker/build_and_push.sh b/.circleci/unittest/linux/docker/build_and_push.sh deleted file mode 100755 index e7ced13ad3..0000000000 --- a/.circleci/unittest/linux/docker/build_and_push.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -if [ $# -ne 1 ]; then - printf "Usage %s \n\n" "$0" - exit 1 -fi - -datestr="$(date "+%Y%m%d")" -if [ "$1" = "cpu" ]; then - base_image="ubuntu:18.04" - image="pytorch/torchaudio_unittest_base:manylinux-${datestr}" -else - base_image="nvidia/cuda:$1-devel-ubuntu18.04" - docker pull "${base_image}" - image="pytorch/torchaudio_unittest_base:manylinux-cuda$1-${datestr}" -fi - -cd "$( dirname "${BASH_SOURCE[0]}" )" - -# docker build also accepts reading from STDIN -# but in that case, no context (other files) can be passed, so we write out Dockerfile -sed "s|BASE_IMAGE|${base_image}|g" Dockerfile > Dockerfile.tmp -docker build -t "${image}" -f Dockerfile.tmp . 
-docker push "${image}" diff --git a/.circleci/unittest/linux/docker/scripts/copy_kaldi_executables.sh b/.circleci/unittest/linux/docker/scripts/copy_kaldi_executables.sh deleted file mode 100755 index b0cf207143..0000000000 --- a/.circleci/unittest/linux/docker/scripts/copy_kaldi_executables.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash - -list_executables() { - # List up executables in the given directory - find "$1" -type f -executable -} - -list_kaldi_libraries() { - # List up shared libraries used by executables found in the given directory ($1) - # that reside in Kaldi directory ($2) - while read file; do - ldd "${file}" | grep -o "${2}.* "; - done < <(list_executables "$1") | sort -u -} - -set -euo pipefail - -kaldi_root="$(realpath "$1")" -target_dir="$(realpath "$2")" - -bin_dir="${target_dir}/bin" -lib_dir="${target_dir}/lib" - -mkdir -p "${bin_dir}" "${lib_dir}" - -# 1. Copy featbins -printf "Copying executables to %s\n" "${bin_dir}" -while read file; do - printf " %s\n" "${file}" - cp "${file}" "${bin_dir}" -done < <(list_executables "${kaldi_root}/src/featbin") - -# 2. Copy dependent libraries from Kaldi -printf "Copying libraries to %s\n" "${lib_dir}" -while read file; do - printf " %s\n" "$file" - # If it is not symlink, just copy to the target directory - if [ ! -L "${file}" ]; then - cp "${file}" "${lib_dir}" - continue - fi - - # If it is symlink, - # 1. Copy the actual library to the target directory. - library="$(realpath "${file}")" - cp "${library}" "${lib_dir}" - # 2. then if the name of the symlink is different from the actual library name, - # create the symlink in the target directory. - lib_name="$(basename "${library}")" - link_name="$(basename "${file}")" - if [ "${lib_name}" != "${link_name}" ]; then - printf " Linking %s -> %s\n" "${lib_name}" "${link_name}" - ( - cd "${lib_dir}" - ln -sf "${lib_name}" "${link_name}" - ) - fi -done < <(list_kaldi_libraries "${bin_dir}" "${kaldi_root}") diff --git a/.circleci/unittest/linux/scripts/run_clang_format.py b/.circleci/unittest/linux/scripts/run_clang_format.py deleted file mode 100755 index 250cc6e387..0000000000 --- a/.circleci/unittest/linux/scripts/run_clang_format.py +++ /dev/null @@ -1,310 +0,0 @@ -#!/usr/bin/env python -"""A wrapper script around clang-format, suitable for linting multiple files -and to use for continuous integration. - -This is an alternative API for the clang-format command line. -It runs over multiple files and directories in parallel. -A diff output is produced and a sensible exit code is returned. - -""" - -import argparse -import codecs -import difflib -import fnmatch -import io -import multiprocessing -import os -import signal -import subprocess -import sys -import traceback -from functools import partial - -try: - from subprocess import DEVNULL # py3k -except ImportError: - DEVNULL = open(os.devnull, "wb") - - -DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu" - - -class ExitStatus: - SUCCESS = 0 - DIFF = 1 - TROUBLE = 2 - - -def list_files(files, recursive=False, extensions=None, exclude=None): - if extensions is None: - extensions = [] - if exclude is None: - exclude = [] - - out = [] - for file in files: - if recursive and os.path.isdir(file): - for dirpath, dnames, fnames in os.walk(file): - fpaths = [os.path.join(dirpath, fname) for fname in fnames] - for pattern in exclude: - # os.walk() supports trimming down the dnames list - # by modifying it in-place, - # to avoid unnecessary directory listings. 
- dnames[:] = [x for x in dnames if not fnmatch.fnmatch(os.path.join(dirpath, x), pattern)] - fpaths = [x for x in fpaths if not fnmatch.fnmatch(x, pattern)] - for f in fpaths: - ext = os.path.splitext(f)[1][1:] - if ext in extensions: - out.append(f) - else: - out.append(file) - return out - - -def make_diff(file, original, reformatted): - return list( - difflib.unified_diff( - original, reformatted, fromfile="{}\t(original)".format(file), tofile="{}\t(reformatted)".format(file), n=3 - ) - ) - - -class DiffError(Exception): - def __init__(self, message, errs=None): - super(DiffError, self).__init__(message) - self.errs = errs or [] - - -class UnexpectedError(Exception): - def __init__(self, message, exc=None): - super(UnexpectedError, self).__init__(message) - self.formatted_traceback = traceback.format_exc() - self.exc = exc - - -def run_clang_format_diff_wrapper(args, file): - try: - ret = run_clang_format_diff(args, file) - return ret - except DiffError: - raise - except Exception as e: - raise UnexpectedError("{}: {}: {}".format(file, e.__class__.__name__, e), e) - - -def run_clang_format_diff(args, file): - try: - with io.open(file, "r", encoding="utf-8") as f: - original = f.readlines() - except IOError as exc: - raise DiffError(str(exc)) - invocation = [args.clang_format_executable, file] - - # Use of utf-8 to decode the process output. - # - # Hopefully, this is the correct thing to do. - # - # It's done due to the following assumptions (which may be incorrect): - # - clang-format will returns the bytes read from the files as-is, - # without conversion, and it is already assumed that the files use utf-8. - # - if the diagnostics were internationalized, they would use utf-8: - # > Adding Translations to Clang - # > - # > Not possible yet! - # > Diagnostic strings should be written in UTF-8, - # > the client can translate to the relevant code page if needed. - # > Each translation completely replaces the format string - # > for the diagnostic. 
- # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation - - try: - proc = subprocess.Popen( - invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, encoding="utf-8" - ) - except OSError as exc: - raise DiffError("Command '{}' failed to start: {}".format(subprocess.list2cmdline(invocation), exc)) - proc_stdout = proc.stdout - proc_stderr = proc.stderr - - # hopefully the stderr pipe won't get full and block the process - outs = list(proc_stdout.readlines()) - errs = list(proc_stderr.readlines()) - proc.wait() - if proc.returncode: - raise DiffError( - "Command '{}' returned non-zero exit status {}".format( - subprocess.list2cmdline(invocation), proc.returncode - ), - errs, - ) - return make_diff(file, original, outs), errs - - -def bold_red(s): - return "\x1b[1m\x1b[31m" + s + "\x1b[0m" - - -def colorize(diff_lines): - def bold(s): - return "\x1b[1m" + s + "\x1b[0m" - - def cyan(s): - return "\x1b[36m" + s + "\x1b[0m" - - def green(s): - return "\x1b[32m" + s + "\x1b[0m" - - def red(s): - return "\x1b[31m" + s + "\x1b[0m" - - for line in diff_lines: - if line[:4] in ["--- ", "+++ "]: - yield bold(line) - elif line.startswith("@@ "): - yield cyan(line) - elif line.startswith("+"): - yield green(line) - elif line.startswith("-"): - yield red(line) - else: - yield line - - -def print_diff(diff_lines, use_color): - if use_color: - diff_lines = colorize(diff_lines) - sys.stdout.writelines(diff_lines) - - -def print_trouble(prog, message, use_colors): - error_text = "error:" - if use_colors: - error_text = bold_red(error_text) - print("{}: {} {}".format(prog, error_text, message), file=sys.stderr) - - -def main(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--clang-format-executable", - metavar="EXECUTABLE", - help="path to the clang-format executable", - default="clang-format", - ) - parser.add_argument( - "--extensions", - help="comma separated list of file extensions (default: {})".format(DEFAULT_EXTENSIONS), - default=DEFAULT_EXTENSIONS, - ) - parser.add_argument("-r", "--recursive", action="store_true", help="run recursively over directories") - parser.add_argument("files", metavar="file", nargs="+") - parser.add_argument("-q", "--quiet", action="store_true") - parser.add_argument( - "-j", - metavar="N", - type=int, - default=0, - help="run N clang-format jobs in parallel" " (default number of cpus + 1)", - ) - parser.add_argument( - "--color", default="auto", choices=["auto", "always", "never"], help="show colored diff (default: auto)" - ) - parser.add_argument( - "-e", - "--exclude", - metavar="PATTERN", - action="append", - default=[], - help="exclude paths matching the given glob-like pattern(s)" " from recursive search", - ) - - args = parser.parse_args() - - # use default signal handling, like diff return SIGINT value on ^C - # https://bugs.python.org/issue14229#msg156446 - signal.signal(signal.SIGINT, signal.SIG_DFL) - try: - signal.SIGPIPE - except AttributeError: - # compatibility, SIGPIPE does not exist on Windows - pass - else: - signal.signal(signal.SIGPIPE, signal.SIG_DFL) - - colored_stdout = False - colored_stderr = False - if args.color == "always": - colored_stdout = True - colored_stderr = True - elif args.color == "auto": - colored_stdout = sys.stdout.isatty() - colored_stderr = sys.stderr.isatty() - - version_invocation = [args.clang_format_executable, str("--version")] - try: - subprocess.check_call(version_invocation, stdout=DEVNULL) - except subprocess.CalledProcessError as 
e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - return ExitStatus.TROUBLE - except OSError as e: - print_trouble( - parser.prog, - "Command '{}' failed to start: {}".format(subprocess.list2cmdline(version_invocation), e), - use_colors=colored_stderr, - ) - return ExitStatus.TROUBLE - - retcode = ExitStatus.SUCCESS - files = list_files( - args.files, recursive=args.recursive, exclude=args.exclude, extensions=args.extensions.split(",") - ) - - if not files: - return - - njobs = args.j - if njobs == 0: - njobs = multiprocessing.cpu_count() + 1 - njobs = min(len(files), njobs) - - if njobs == 1: - # execute directly instead of in a pool, - # less overhead, simpler stacktraces - it = (run_clang_format_diff_wrapper(args, file) for file in files) - pool = None - else: - pool = multiprocessing.Pool(njobs) - it = pool.imap_unordered(partial(run_clang_format_diff_wrapper, args), files) - while True: - try: - outs, errs = next(it) - except StopIteration: - break - except DiffError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - retcode = ExitStatus.TROUBLE - sys.stderr.writelines(e.errs) - except UnexpectedError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - sys.stderr.write(e.formatted_traceback) - retcode = ExitStatus.TROUBLE - # stop at the first unexpected error, - # something could be very wrong, - # don't process all files unnecessarily - if pool: - pool.terminate() - break - else: - sys.stderr.writelines(errs) - if outs == []: - continue - if not args.quiet: - print_diff(outs, use_color=colored_stdout) - if retcode == ExitStatus.SUCCESS: - retcode = ExitStatus.DIFF - return retcode - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.circleci/unittest/linux/scripts/run_style_checks.sh b/.circleci/unittest/linux/scripts/run_style_checks.sh deleted file mode 100755 index 0620f4867e..0000000000 --- a/.circleci/unittest/linux/scripts/run_style_checks.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -set -eux - -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -eval "$("${conda_dir}/bin/conda" shell.bash hook)" -conda activate "${env_dir}" - -# 1. Install tools -conda install -y flake8==3.9.2 -printf "Installed flake8: " -flake8 --version - -clangformat_path="${root_dir}/clang-format" -curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o "${clangformat_path}" -chmod +x "${clangformat_path}" -printf "Installed clang-fortmat" -"${clangformat_path}" --version - -# 2. Run style checks -# We want to run all the style checks even if one of them fail. - -set +e - -exit_status=0 - -printf "\x1b[34mRunning flake8:\x1b[0m\n" -flake8 torchaudio test tools/setup_helpers docs/source/conf.py examples -status=$? -exit_status="$((exit_status+status))" -if [ "${status}" -ne 0 ]; then - printf "\x1b[31mflake8 failed. Check the format of Python files.\x1b[0m\n" -fi - -printf "\x1b[34mRunning clang-format:\x1b[0m\n" -"${this_dir}"/run_clang_format.py \ - -r torchaudio/csrc third_party/kaldi/src \ - --clang-format-executable "${clangformat_path}" \ - && git diff --exit-code -status=$? -exit_status="$((exit_status+status))" -if [ "${status}" -ne 0 ]; then - printf "\x1b[31mC++ files are not formatted. 
Please use clang-format to format CPP files.\x1b[0m\n" -fi -exit $exit_status diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh deleted file mode 100755 index f807193227..0000000000 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchaudio here, otherwise they also get cached. - -set -ex - -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - curl --silent -L -o miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$("${conda_dir}/bin/conda" shell.bash hook)" - - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment with PYTHON_VERSION=%s\n" "${PYTHON_VERSION}\n" - conda create --prefix "${env_dir}" -y python="${PYTHON_VERSION}" -fi -conda activate "${env_dir}" - -# 3. Install minimal build tools -pip --quiet install cmake ninja -conda install --quiet -y -c conda-forge 'ffmpeg==5.1' pkg-config diff --git a/.circleci/unittest/windows/README.md b/.circleci/unittest/windows/README.md deleted file mode 100644 index 2c06af62bd..0000000000 --- a/.circleci/unittest/windows/README.md +++ /dev/null @@ -1,4 +0,0 @@ -This directory contains; - - - scripts - Scripts used by CircleCI to run unit tests. diff --git a/.flake8 b/.flake8 index b4e3878df3..dae85fef0d 100644 --- a/.flake8 +++ b/.flake8 @@ -1,4 +1,12 @@ [flake8] +# Note: it's recommended to use `pre-commit run -a flake8` + max-line-length = 120 -ignore = E203,E305,E402,E721,E741,F405,W503,W504,F999 -exclude = build,docs/source,_ext,third_party,examples/tutorials +ignore = E203,E402,E741,W503 + +# Note: exclude is not honnored when flake8 is executed from pre-commit. +# pre-commit has a separate config +exclude = build,docs/src,third_party + +per-file-ignores = + examples/tutorials/*.py: E501 diff --git a/.github/process_commit.py b/.github/process_commit.py index a068fc5d27..d369006ffb 100644 --- a/.github/process_commit.py +++ b/.github/process_commit.py @@ -8,7 +8,7 @@ import json import os import sys -from typing import Any, Optional, Set, Tuple +from typing import Any, Optional, Set import requests @@ -84,12 +84,12 @@ def post_github_comment(pr_number: int, merger: str) -> Any: and ML-related components under 'torchaudio/csrc' (e.g. RNN-T loss). Things in "examples" directory: -- 'recipe' is applicable to training recipes under the 'examples' folder, +- 'recipe' is applicable to training recipes under the 'examples' folder, - 'tutorial' is applicable to tutorials under the “examples/tutorials” folder -- 'example' is applicable to everything else (e.g. C++ examples) -- 'module: docs' is applicable to code documentations (not to tutorials). \ +- 'example' is applicable to everything else (e.g. C++ examples) +- 'module: docs' is applicable to code documentations (not to tutorials). -Regarding examples in code documentations, please also use 'module: docs'. +Regarding examples in code documentations, please also use 'module: docs'. 
Please use 'other' tag only when you’re sure the changes are not much relevant to users, \ or when all other tags are not applicable. Try not to use it often, in order to minimize \ @@ -98,7 +98,7 @@ def post_github_comment(pr_number: int, merger: str) -> Any: --- When preparing release notes, please make sure 'documentation' and 'tutorials' occur as the \ -last sub-categories under each primary category like 'new feature', 'improvements' or 'prototype'. +last sub-categories under each primary category like 'new feature', 'improvements' or 'prototype'. Things related to build are by default excluded from the release note, \ except when it impacts users. For example: diff --git a/.circleci/unittest/linux/scripts/install.sh b/.github/scripts/unittest-linux/install.sh similarity index 76% rename from .circleci/unittest/linux/scripts/install.sh rename to .github/scripts/unittest-linux/install.sh index 487e92d638..9ed663be89 100755 --- a/.circleci/unittest/linux/scripts/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -1,27 +1,24 @@ #!/usr/bin/env bash +# NOTE +# Currently Linux GPU code has separate run script hardcoded in GHA YAML file. +# Therefore the CUDA-related things in this script is not used, and it's broken. +# TODO: Migrate GHA Linux GPU test job to this script. + unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. +# No need to set PYTORCH_VERSION for unit test, as we use nightly PyTorch. # In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. set -e -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux + Darwin*) + os=MacOSX;; + *) + os=Linux + eval "$("/opt/conda/bin/conda" shell.bash hook)" esac -# 0. Activate conda env -eval "$("${conda_dir}/bin/conda" shell.bash hook)" -conda activate "${env_dir}" - # 1. Install PyTorch if [ -z "${CUDA_VERSION:-}" ] ; then if [ "${os}" == MacOSX ] ; then @@ -57,6 +54,8 @@ printf "Installing PyTorch with %s\n" "${cudatoolkit}" ) # 2. 
Install torchaudio +conda install --quiet -y 'ffmpeg>=4.1' ninja cmake + printf "* Installing torchaudio\n" python setup.py install @@ -72,7 +71,7 @@ fi ( set -x conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa==0.10.0' parameterized 'requests>=2.20' - pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm + pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm ) # Install fairseq git clone https://github.com/pytorch/fairseq diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.github/scripts/unittest-linux/run_test.sh similarity index 76% rename from .circleci/unittest/linux/scripts/run_test.sh rename to .github/scripts/unittest-linux/run_test.sh index 3db02c627a..b56677acb2 100755 --- a/.circleci/unittest/linux/scripts/run_test.sh +++ b/.github/scripts/unittest-linux/run_test.sh @@ -2,8 +2,7 @@ set -e -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env +eval "$(/opt/conda/bin/conda shell.bash hook)" python -m torch.utils.collect_env env | grep TORCHAUDIO || true @@ -13,7 +12,7 @@ export PATH="${PWD}/third_party/install/bin/:${PATH}" declare -a args=( '-v' '--cov=torchaudio' - "--junitxml=${PWD}/test-results/junit.xml" + "--junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml" '--durations' '20' ) diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.github/scripts/unittest-windows/environment.yml similarity index 100% rename from .circleci/unittest/windows/scripts/environment.yml rename to .github/scripts/unittest-windows/environment.yml diff --git a/.circleci/unittest/windows/scripts/install.sh b/.github/scripts/unittest-windows/install.sh similarity index 95% rename from .circleci/unittest/windows/scripts/install.sh rename to .github/scripts/unittest-windows/install.sh index 7a5111ddfb..676de7e830 100644 --- a/.circleci/unittest/windows/scripts/install.sh +++ b/.github/scripts/unittest-windows/install.sh @@ -5,7 +5,7 @@ unset PYTORCH_VERSION # so no need to set PYTORCH_VERSION. # In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. -set -ex +set -euxo pipefail root_dir="$(git rev-parse --show-toplevel)" conda_dir="${root_dir}/conda" @@ -43,6 +43,9 @@ if [ ! -z "${CUDA_VERSION:-}" ] ; then fi # 2. Install torchaudio +printf "* Installing fsspec\n" +pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org fsspec + printf "* Installing torchaudio\n" "$root_dir/packaging/vc_env_helper.bat" python setup.py install @@ -64,7 +67,6 @@ case "$(python --version)" in esac # Note: installing librosa via pip fail because it will try to compile numba. 
( - set -x conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa==0.10.0' parameterized 'requests>=2.20' # Need to disable shell check since this'll fail out if SENTENCEPIECE_DEPENDENCY is empty # shellcheck disable=SC2086 @@ -80,7 +82,6 @@ esac pytest-cov \ pytorch-lightning \ 'scipy==1.7.3' \ - transformers \ unidecode \ 'protobuf<4.21.0' \ demucs \ diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.github/scripts/unittest-windows/install_conda.bat similarity index 100% rename from .circleci/unittest/windows/scripts/install_conda.bat rename to .github/scripts/unittest-windows/install_conda.bat diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.github/scripts/unittest-windows/run_test.sh similarity index 70% rename from .circleci/unittest/windows/scripts/run_test.sh rename to .github/scripts/unittest-windows/run_test.sh index 22a53911e3..5ac228a0e1 100644 --- a/.circleci/unittest/windows/scripts/run_test.sh +++ b/.github/scripts/unittest-windows/run_test.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -ex +set -euxo pipefail eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" conda activate ./env @@ -12,5 +12,5 @@ python -m torch.utils.collect_env env | grep TORCHAUDIO || true cd test -pytest --cov=torchaudio --junitxml=../test-results/junit.xml -v --durations 20 torchaudio_unittest +pytest --cov=torchaudio --junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml -v --durations 20 torchaudio_unittest coverage html diff --git a/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/.github/scripts/unittest-windows/set_cuda_envs.sh similarity index 73% rename from .circleci/unittest/windows/scripts/set_cuda_envs.sh rename to .github/scripts/unittest-windows/set_cuda_envs.sh index e2d014b27b..ea3a126d03 100644 --- a/.circleci/unittest/windows/scripts/set_cuda_envs.sh +++ b/.github/scripts/unittest-windows/set_cuda_envs.sh @@ -1,18 +1,10 @@ #!/usr/bin/env bash -set -ex +set -euxo pipefail -echo CU_VERSION is "${CU_VERSION}" -echo CUDA_VERSION is "${CUDA_VERSION}" - -# Currenly, CU_VERSION and CUDA_VERSION are not consistent. -# to understand this code, please checck out https://github.com/pytorch/vision/issues/4443 -version="cpu" -if [[ ! -z "${CUDA_VERSION}" ]] ; then - version="$CUDA_VERSION" +if [ -z "${CUDA_VERSION:-}" ] ; then + version="cpu" else - if [[ ${#CU_VERSION} -eq 5 ]]; then - version="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi + version="$CUDA_VERSION" fi # Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi. diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.github/scripts/unittest-windows/setup_env.sh similarity index 95% rename from .circleci/unittest/windows/scripts/setup_env.sh rename to .github/scripts/unittest-windows/setup_env.sh index 578c6d2404..88ac15ca5c 100644 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ b/.github/scripts/unittest-windows/setup_env.sh @@ -5,7 +5,7 @@ # # Do not install PyTorch and torchaudio here, otherwise they also get cached. -set -e +set -euxo pipefail this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" root_dir="$(git rev-parse --show-toplevel)" @@ -35,4 +35,4 @@ conda activate "${env_dir}" # 3. 
Install minimal build tools pip --quiet install cmake ninja -conda install --quiet -y -c conda-forge 'ffmpeg==5.1' +conda install --quiet -y 'ffmpeg>=4.1' diff --git a/.github/workflows/build-conda-m1.yml b/.github/workflows/build-conda-m1.yml index a23d6e5dac..7d30dd8c61 100644 --- a/.github/workflows/build-conda-m1.yml +++ b/.github/workflows/build-conda-m1.yml @@ -32,8 +32,11 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: macos-m1-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh + conda-package-directory: packaging/torchaudio smoke-test-script: test/smoke_test/smoke_test.py package-name: torchaudio name: ${{ matrix.repository }} @@ -51,5 +54,7 @@ jobs: runner-type: macos-m1-12 package-name: ${{ matrix.package-name }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/build-conda-windows.yml b/.github/workflows/build-conda-windows.yml index 4d1fbbaa29..e6a8c87b10 100644 --- a/.github/workflows/build-conda-windows.yml +++ b/.github/workflows/build-conda-windows.yml @@ -28,12 +28,13 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: windows-ffmpeg pre-script: packaging/ffmpeg/build.bat - wheel-build-params: "--plat-name win_amd64" post-script: "" - package-name: torchaudio - conda-package-directory: packaging/torchaudio smoke-test-script: test/smoke_test/smoke_test.py + conda-package-directory: packaging/torchaudio + package-name: torchaudio name: ${{ matrix.repository }} uses: pytorch/test-infra/.github/workflows/build_conda_windows.yml@main with: @@ -48,5 +49,7 @@ jobs: package-name: ${{ matrix.package-name }} smoke-test-script: ${{ matrix.smoke-test-script }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml index bee897f140..33fae4f19b 100644 --- a/.github/workflows/build-wheels-m1.yml +++ b/.github/workflows/build-wheels-m1.yml @@ -32,6 +32,8 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: macos-m1-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh smoke-test-script: test/smoke_test/smoke_test.py @@ -50,6 +52,8 @@ jobs: runner-type: macos-m1-12 package-name: ${{ matrix.package-name }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build_conda_linux.yml b/.github/workflows/build_conda_linux.yml index d3cfcbaefe..60a46f25f1 100644 --- a/.github/workflows/build_conda_linux.yml +++ b/.github/workflows/build_conda_linux.yml @@ -32,9 +32,12 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: linux-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh smoke-test-script: test/smoke_test/smoke_test.py + conda-package-directory: packaging/torchaudio package-name: torchaudio name: ${{ 
matrix.repository }} uses: pytorch/test-infra/.github/workflows/build_conda_linux.yml@main @@ -50,5 +53,7 @@ jobs: package-name: ${{ matrix.package-name }} smoke-test-script: ${{ matrix.smoke-test-script }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/build_conda_macos.yml b/.github/workflows/build_conda_macos.yml index 2944829f47..8e29ac5155 100644 --- a/.github/workflows/build_conda_macos.yml +++ b/.github/workflows/build_conda_macos.yml @@ -32,8 +32,11 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: macos-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh + conda-package-directory: packaging/torchaudio smoke-test-script: test/smoke_test/smoke_test.py package-name: torchaudio name: ${{ matrix.repository }} @@ -51,5 +54,7 @@ jobs: runner-type: macos-12 package-name: ${{ matrix.package-name }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index bafe79ac4c..55ca11da43 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -106,11 +106,14 @@ jobs: "${CUDATOOLKIT}" # Install torchaudio + # TODO: Enable NVDec/NVEnc + conda install --quiet -y 'ffmpeg>=4.1' pkg-config pip --quiet install cmake>=3.18.0 ninja - cd packaging - . ./pkg_helpers.bash - setup_build_version - cd ../ + BUILD_VERSION=$( cut -f 1 -d a version.txt ) + BUILD_VERSION=$BUILD_VERSION.dev$(date "+%Y%m%d") + export BUILD_VERSION + + FFMPEG_ROOT=${CONDA_PREFIX} USE_FFMPEG=1 USE_CUDA=1 pip install --progress-bar off -v -e . 
--no-use-pep517 # Install runtime dependencies diff --git a/.github/workflows/build_wheels_linux.yml b/.github/workflows/build_wheels_linux.yml index f58824badb..c2febd674a 100644 --- a/.github/workflows/build_wheels_linux.yml +++ b/.github/workflows/build_wheels_linux.yml @@ -32,6 +32,8 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: linux-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh smoke-test-script: test/smoke_test/smoke_test.py diff --git a/.github/workflows/build_wheels_macos.yml b/.github/workflows/build_wheels_macos.yml index 55010862e9..80eeb92203 100644 --- a/.github/workflows/build_wheels_macos.yml +++ b/.github/workflows/build_wheels_macos.yml @@ -32,6 +32,8 @@ jobs: matrix: include: - repository: pytorch/audio + cache-path: /third_party/ffmpeg + cache-key: macos-ffmpeg pre-script: packaging/pre_build_script.sh post-script: packaging/post_build_script.sh smoke-test-script: test/smoke_test/smoke_test.py @@ -50,6 +52,8 @@ jobs: runner-type: macos-12 package-name: ${{ matrix.package-name }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build_wheels_windows.yml b/.github/workflows/build_wheels_windows.yml index 7c9ef1df32..d870eb3e3a 100644 --- a/.github/workflows/build_wheels_windows.yml +++ b/.github/workflows/build_wheels_windows.yml @@ -30,9 +30,11 @@ jobs: - repository: pytorch/audio pre-script: packaging/ffmpeg/build.bat env-script: packaging/vc_env_helper.bat + cache-path: /third_party/ffmpeg + cache-key: windows-ffmpeg wheel-build-params: "--plat-name win_amd64" post-script: "" - smoke-test-script: "test/smoke_test/smoke_test.py" + smoke-test-script: test/smoke_test/smoke_test.py package-name: torchaudio name: ${{ matrix.repository }} uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@main @@ -48,6 +50,8 @@ jobs: package-name: ${{ matrix.package-name }} smoke-test-script: ${{ matrix.smoke-test-script }} trigger-event: ${{ github.event_name }} + cache-path: ${{ matrix.cache-path }} + cache-key: ${{ matrix.cache-key }} secrets: AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/docstring_parameters_sync.yml b/.github/workflows/docstring_parameters_sync.yml deleted file mode 100644 index 398aff1c8d..0000000000 --- a/.github/workflows/docstring_parameters_sync.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Docstring Parameters Sync -on: - pull_request: - push: - branches: - - nightly - workflow_dispatch: -jobs: - check-docstring-sync: - name: "Check whether the docstring parameters are in-sync" - runs-on: ubuntu-latest - container: - image: pytorch/conda-builder:cpu - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Setup Minconda - uses: conda-incubator/setup-miniconda@v2 - with: - miniconda-version: "latest" - python-version: 3.8 - - name: Create Conda Env - shell: bash -l {0} - run: | - conda clean --all --quiet --yes - CONDA_ENV="${RUNNER_TEMP}/conda_environment_${GITHUB_RUN_ID}" - conda create \ - --yes \ - --prefix "${CONDA_ENV}" \ - "python=3.8" - echo "CONDA_ENV=${CONDA_ENV}" >> 
"${GITHUB_ENV}" - echo "CONDA_RUN=conda run -p ${CONDA_ENV}" >> "${GITHUB_ENV}" - - name: Run pydocstyle - shell: bash -l {0} - run: | - ${CONDA_RUN} pip install pydocstyle - ${CONDA_RUN} pydocstyle torchaudio diff --git a/.github/workflows/ffmpeg.yml b/.github/workflows/ffmpeg.yml new file mode 100644 index 0000000000..9c5bed7800 --- /dev/null +++ b/.github/workflows/ffmpeg.yml @@ -0,0 +1,107 @@ +# This job is not directly related to regular CI pipeline. +# It is intended to create FFmpeg binaries that we upload on S3, +# which then will be used during all the build process in CI or local. +# +# This job does not include uploading part. +# Upload needs to be done manually, and it should be done only once +# par new major release of FFmepg. +name: FFmpeg Binaries + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' # on sunday + +jobs: + LGPL-Linux-x86_64: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + job-name: Build + upload-artifact: ffmpeg-lgpl + repository: pytorch/audio + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.sh + + cd "${FFMPEG_ROOT}/.." + tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux_x86_64/" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" + + LGPL-Linux-aarch64: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + job-name: Build + upload-artifact: ffmpeg-lgpl + repository: pytorch/audio + runner: "linux.t4g.2xlarge" + no-sudo: true + docker-image: quay.io/pypa/manylinux2014_aarch64 + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.sh + + cd "${FFMPEG_ROOT}/.." + tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux_aarch64/" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" + + LGPL-macOS: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + runner: ["macos-m1-12", "macos-12"] + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + job-name: Build + upload-artifact: ffmpeg-lgpl + repository: pytorch/audio + runner: "${{ matrix.runner }}" + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.sh + + cd "${FFMPEG_ROOT}/.." + tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/macos_$(uname -m)" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" + + LGPL-Windows: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + job-name: Build + upload-artifact: ffmpeg-lgpl + repository: pytorch/audio + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.bat + + cd "${FFMPEG_ROOT}/.." 
+ tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/bin + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/windows" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..69d49dc5c9 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,37 @@ +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + repository: pytorch/audio + script: | + set -euo pipefail + + echo '::group::Setup environment' + eval "$("$(which conda)" shell.bash hook)" + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files --show-diff-on-failure + status=$? + + echo '::group::Add Summry' + if [ $status -ne 0 ]; then + echo '### Lint failure' >> $GITHUB_STEP_SUMMARY + echo '```diff' >> $GITHUB_STEP_SUMMARY + git --no-pager diff >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + fi + echo '::endgroup::' + exit $status diff --git a/.github/workflows/unittest-linux-cpu.yml b/.github/workflows/unittest-linux-cpu.yml index 1403c175dc..d179ca8bc2 100644 --- a/.github/workflows/unittest-linux-cpu.yml +++ b/.github/workflows/unittest-linux-cpu.yml @@ -54,14 +54,10 @@ jobs: set -euxo pipefail - echo '::group::Setup Conda Environment' - ./.circleci/unittest/linux/scripts/setup_env.sh - echo '::endgroup::' - echo '::group::Install PyTorch and Torchaudio' - ./.circleci/unittest/linux/scripts/install.sh + ./.github/scripts/unittest-linux/install.sh echo '::endgroup::' echo '::group::Run Tests' - ./.circleci/unittest/linux/scripts/run_test.sh + ./.github/scripts/unittest-linux/run_test.sh echo '::endgroup::' diff --git a/.github/workflows/unittest-linux-gpu.yml b/.github/workflows/unittest-linux-gpu.yml index 057bddc0e0..b6bb10daf6 100644 --- a/.github/workflows/unittest-linux-gpu.yml +++ b/.github/workflows/unittest-linux-gpu.yml @@ -55,7 +55,7 @@ jobs: "${CUDATOOLKIT}" # Install torchaudio - conda install --quiet -y -c conda-forge 'ffmpeg==5.1' pkg-config + conda install --quiet -y 'ffmpeg>=4.1' pkg-config python3 -m pip --quiet install cmake>=3.18.0 ninja USE_FFMPEG=1 python3 -m pip install -v -e . 
--no-use-pep517 @@ -74,11 +74,12 @@ jobs: export TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY=true export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL=true export TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_TEMPORARY_DISABLED=true declare -a args=( '-v' '--cov=torchaudio' - "--junitxml=${PWD}/test-results/junit.xml" + "--junitxml=${RUNNER_TEST_RESULTS_DIR}/junit.xml" '--durations' '100' '-k' 'cuda or gpu' ) diff --git a/.github/workflows/unittest-macos-cpu.yml b/.github/workflows/unittest-macos-cpu.yml index 962d8949c1..bc386aa3cc 100644 --- a/.github/workflows/unittest-macos-cpu.yml +++ b/.github/workflows/unittest-macos-cpu.yml @@ -53,14 +53,10 @@ jobs: set -euxo pipefail - echo '::group::Setup Conda Environment' - ./.circleci/unittest/linux/scripts/setup_env.sh - echo '::endgroup::' - echo '::group::Install PyTorch and Torchaudio' - ./.circleci/unittest/linux/scripts/install.sh + ./.github/scripts/unittest-linux/install.sh echo '::endgroup::' echo '::group::Run Tests' - ./.circleci/unittest/linux/scripts/run_test.sh + ./.github/scripts/unittest-linux/run_test.sh echo '::endgroup::' diff --git a/.github/workflows/unittest-windows-cpu.yml b/.github/workflows/unittest-windows-cpu.yml new file mode 100644 index 0000000000..8c4fee312d --- /dev/null +++ b/.github/workflows/unittest-windows-cpu.yml @@ -0,0 +1,52 @@ +name: Unittests on Windows CPU + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-windows-cpu: + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + repository: pytorch/audio + runner: windows.4xlarge + timeout: 180 + script: | + # Mark Build Directory Safe + git config --global --add safe.directory /__w/audio/audio + + # Set up Environment Variables + export PYTHON_VERSION="3.8" + export USE_FFMPEG="1" + unset CUDA_VERSION + + # Set CHANNEL + if [[(${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + export UPLOAD_CHANNEL=test + else + export UPLOAD_CHANNEL=nightly + fi + + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CUDA=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_HW_ACCEL=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS=true + + .github/scripts/unittest-windows/setup_env.sh + .github/scripts/unittest-windows/install.sh + .github/scripts/unittest-windows/run_test.sh diff --git a/.github/workflows/unittest-windows-gpu.yml b/.github/workflows/unittest-windows-gpu.yml new file mode 100644 index 0000000000..4b8b997e0a --- /dev/null +++ b/.github/workflows/unittest-windows-gpu.yml @@ -0,0 +1,56 @@ +name: Unittests on Windows GPU + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-windows-gpu: + uses: 
pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + repository: pytorch/audio + runner: windows.8xlarge.nvidia.gpu + timeout: 360 + script: | + # Mark Build Directory Safe + git config --global --add safe.directory /__w/audio/audio + + # Set up Environment Variables + export PYTHON_VERSION="3.8" + export USE_FFMPEG="1" + export CUDA_VERSION=11.8 + + # Set CHANNEL + if [[(${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + export UPLOAD_CHANNEL=test + else + export UPLOAD_CHANNEL=nightly + fi + + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_APPLY_CMVN_SLIDING=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_FBANK_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_KALDI_PITCH_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_MFCC_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_COMPUTE_SPECTROGRAM_FEATS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_CMD_SOX=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_KALDI=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_SOX=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_ON_PYTHON_310=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MOD_sentencepiece=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_AUDIO_OUT_DEVICE=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_MACOS=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_CUDA_SMALL_MEMORY=true + export TORCHAUDIO_TEST_ALLOW_SKIP_IF_TEMPORARY_DISABLED=true + + + .github/scripts/unittest-windows/setup_env.sh + ./packaging/windows/internal/cuda_install.bat + ./packaging/windows/internal/driver_update.bat + + .github/scripts/unittest-windows/install.sh + .github/scripts/unittest-windows/run_test.sh diff --git a/.gitmodules b/.gitmodules index 724846120c..e69de29bb2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +0,0 @@ -[submodule "kaldi"] - path = third_party/kaldi/submodule - url = https://github.com/kaldi-asr/kaldi - ignore = dirty diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cf9a6d523f..d8c4096aec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,6 @@ +default_language_version: + node: 16.14.2 + repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 @@ -16,4 +19,28 @@ repos: - black == 22.3 - usort == 1.0.2 - libcst == 0.4.1 - exclude: examples + + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v11.0.1 + hooks: + - id: clang-format + + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + args: ['torchaudio', 'test', 'tools', 'docs/source/conf.py', 'examples'] + exclude: 'build|docs/src|third_party' + additional_dependencies: + - flake8-breakpoint == 1.1.0 + - flake8-bugbear == 22.6.22 + - flake8-comprehensions == 3.10.0 + - flake8-pyi == 22.5.1 + - mccabe == 0.6.0 + - pycodestyle == 2.8.0 + + - repo: https://github.com/pycqa/pydocstyle + rev: 6.3.0 + hooks: + - id: pydocstyle + exclude: 'build|test|examples|third_party|docs|tools' diff --git a/CMakeLists.txt b/CMakeLists.txt index ab70cff60c..026307553a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,7 +53,6 @@ endif() # Options option(BUILD_SOX "Build libsox statically" ON) -option(BUILD_KALDI "Build kaldi statically" ON) option(BUILD_RIR "Enable RIR simulation" ON) option(BUILD_RNNT "Enable RNN transducer" ON) option(BUILD_ALIGN "Enable forced alignment" ON) @@ -166,12 +165,13 @@ else() message(STATUS "Could not find ccache. 
Consider installing ccache to speed up compilation.") endif() -add_subdirectory(third_party) add_subdirectory(torchaudio/csrc) if (BUILD_SOX) + add_subdirectory(third_party/sox) add_subdirectory(torchaudio/csrc/sox) endif() if (USE_FFMPEG) + add_subdirectory(third_party/ffmpeg) add_subdirectory(torchaudio/csrc/ffmpeg) endif() if (BUILD_CUDA_CTC_DECODER) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 125aa83847..3e615a799e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -74,7 +74,6 @@ python setup.py develop Some environmnet variables that change the build behavior - `BUILD_SOX`: Deteremines whether build and bind libsox in non-Windows environments. (no effect in Windows as libsox integration is not available) Default value is 1 (build and bind). Use 0 for disabling it. - `USE_CUDA`: Determines whether build the custom CUDA kernel. Default to the availability of CUDA-compatible GPUs. -- `BUILD_KALDI`: Determines whether build Kaldi extension. This is required for `kaldi_pitch` function. Default value is 1 on Linux/macOS and 0 on Windows. - `BUILD_RNNT`: Determines whether build RNN-T loss function. Default value is 1. - `BUILD_CUDA_CTC_DECODER`: Determines whether build decoder features based on CUDA CTC decoder. Default value is 1. (`USE_CUDA` has to be 1.) diff --git a/README.md b/README.md index 414b75bca4..019359752d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ torchaudio: an audio library for PyTorch ======================================== -[![Build Status](https://circleci.com/gh/pytorch/audio.svg?style=svg)](https://app.circleci.com/pipelines/github/pytorch/audio) [![Documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchaudio%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/audio/main/) [![Anaconda Badge](https://anaconda.org/pytorch/torchaudio/badges/downloads.svg)](https://anaconda.org/pytorch/torchaudio) [![Anaconda-Server Badge](https://anaconda.org/pytorch/torchaudio/badges/platforms.svg)](https://anaconda.org/pytorch/torchaudio) @@ -21,10 +20,12 @@ to use and feel like a natural extension. 
- Load a variety of audio formats, such as `wav`, `mp3`, `ogg`, `flac`, `opus`, `sphere`, into a torch Tensor using SoX - [Kaldi (ark/scp)](http://pytorch.org/audio/main/kaldi_io.html) - [Dataloaders for common audio datasets](http://pytorch.org/audio/main/datasets.html) +- Audio and speech processing functions + - [forced_align](https://pytorch.org/audio/main/generated/torchaudio.functional.forced_align.html) - Common audio transforms - - [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html) + - [Spectrogram, AmplitudeToDB, MelScale, MelSpectrogram, MFCC, MuLawEncoding, MuLawDecoding, Resample](http://pytorch.org/audio/main/transforms.html) - Compliance interfaces: Run code using PyTorch that align with other libraries - - [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html) + - [Kaldi: spectrogram, fbank, mfcc](https://pytorch.org/audio/main/compliance.kaldi.html) Installation ------------ diff --git a/cmake/FindFFMPEG.cmake b/cmake/FindFFMPEG.cmake deleted file mode 100644 index ad74dd8f14..0000000000 --- a/cmake/FindFFMPEG.cmake +++ /dev/null @@ -1,264 +0,0 @@ -#[==[ - -Originally taken from: https://github.com/Kitware/VTK/blob/8485477f9aa41f3c33094c3beb201e747abf5541/CMake/FindFFMPEG.cmake -License: https://github.com/Kitware/VTK/blob/8485477f9aa41f3c33094c3beb201e747abf5541/Copyright.txt - -/*========================================================================= - - Program: Visualization Toolkit - Module: Copyright.txt - -Copyright (c) 1993-2015 Ken Martin, Will Schroeder, Bill Lorensen -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither name of Ken Martin, Will Schroeder, or Bill Lorensen nor the names - of any contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -=========================================================================*/ - -========================================================================= - -Provides the following variables: - - * `FFMPEG_INCLUDE_DIRS`: Include directories necessary to use FFMPEG. - * `FFMPEG_LIBRARIES`: Libraries necessary to use FFMPEG. Note that this only - includes libraries for the components requested. 
- * `FFMPEG_VERSION`: The version of FFMPEG found. - -The following components are supported: - - * `avcodec` - * `avdevice` - * `avfilter` - * `avformat` - * `avutil` - * `swresample` - * `swscale` - -For each component, the following are provided: - - * `FFMPEG__FOUND`: Libraries for the component. - * `FFMPEG__INCLUDE_DIRS`: Include directories for - the component. - * `FFMPEG__LIBRARIES`: Libraries for the component. - * `FFMPEG::`: A target to use with `target_link_libraries`. - -Note that only components requested with `COMPONENTS` or `OPTIONAL_COMPONENTS` -are guaranteed to set these variables or provide targets. -#]==] - -function (_ffmpeg_find component headername) - find_path("FFMPEG_${component}_INCLUDE_DIR" - NAMES - "lib${component}/${headername}" - PATHS - "$ENV{FFMPEG_ROOT}/include" - PATH_SUFFIXES - ffmpeg - DOC "FFMPEG's ${component} include directory" - NO_DEFAULT_PATH) - find_path("FFMPEG_${component}_INCLUDE_DIR" - NAMES - "lib${component}/${headername}" - PATHS - "$ENV{CONDA_PREFIX}/include" - PATH_SUFFIXES - ffmpeg - DOC "FFMPEG's ${component} include directory" - NO_DEFAULT_PATH) - find_path("FFMPEG_${component}_INCLUDE_DIR" - NAMES - "lib${component}/${headername}" - PATHS - ~/Library/Frameworks - /Library/Frameworks - /usr/local/include - /usr/include - /sw/include # Fink - /opt/local/include # DarwinPorts - /opt/csw/include # Blastwave - /opt/include - /usr/freeware/include - PATH_SUFFIXES - ffmpeg - DOC "FFMPEG's ${component} include directory") - mark_as_advanced("FFMPEG_${component}_INCLUDE_DIR") - - # On Windows, static FFMPEG is sometimes built as `lib.a`. - if (WIN32) - list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".a" ".lib") - list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib") - endif () - - find_library("FFMPEG_${component}_LIBRARY" - NAMES - "${component}" - PATHS - "$ENV{FFMPEG_ROOT}/lib" - "$ENV{FFMPEG_ROOT}/bin" - DOC "FFMPEG's ${component} library" - NO_DEFAULT_PATH) - find_library("FFMPEG_${component}_LIBRARY" - NAMES - "${component}" - PATHS - "$ENV{CONDA_PREFIX}/lib" - "$ENV{CONDA_PREFIX}/bin" - DOC "FFMPEG's ${component} library" - NO_DEFAULT_PATH) - find_library("FFMPEG_${component}_LIBRARY" - NAMES - "${component}" - PATHS - ~/Library/Frameworks - /Library/Frameworks - /usr/local/lib - /usr/local/lib64 - /usr/lib - /usr/lib64 - /sw/lib - /opt/local/lib - /opt/csw/lib - /opt/lib - /usr/freeware/lib64 - DOC "FFMPEG's ${component} library") - mark_as_advanced("FFMPEG_${component}_LIBRARY") - - if (FFMPEG_${component}_LIBRARY AND FFMPEG_${component}_INCLUDE_DIR) - set(_deps_found TRUE) - set(_deps_link) - foreach (_ffmpeg_dep IN LISTS ARGN) - if (TARGET "FFMPEG::${_ffmpeg_dep}") - list(APPEND _deps_link "FFMPEG::${_ffmpeg_dep}") - else () - set(_deps_found FALSE) - endif () - endforeach () - if (_deps_found) - if (NOT TARGET "FFMPEG::${component}") - add_library("FFMPEG::${component}" UNKNOWN IMPORTED) - set_target_properties("FFMPEG::${component}" PROPERTIES - IMPORTED_LOCATION "${FFMPEG_${component}_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES "${FFMPEG_${component}_INCLUDE_DIR}" - IMPORTED_LINK_INTERFACE_LIBRARIES "${_deps_link}") - endif () - set("FFMPEG_${component}_FOUND" 1 - PARENT_SCOPE) - - set(version_header_path "${FFMPEG_${component}_INCLUDE_DIR}/lib${component}/version.h") - if (EXISTS "${version_header_path}") - string(TOUPPER "${component}" component_upper) - file(STRINGS "${version_header_path}" version - REGEX "#define *LIB${component_upper}_VERSION_(MAJOR|MINOR|MICRO) ") - string(REGEX REPLACE ".*_MAJOR *\([0-9]*\).*" "\\1" major 
"${version}") - string(REGEX REPLACE ".*_MINOR *\([0-9]*\).*" "\\1" minor "${version}") - string(REGEX REPLACE ".*_MICRO *\([0-9]*\).*" "\\1" micro "${version}") - if (NOT major STREQUAL "" AND - NOT minor STREQUAL "" AND - NOT micro STREQUAL "") - set("FFMPEG_${component}_VERSION" "${major}.${minor}.${micro}" - PARENT_SCOPE) - endif () - endif () - else () - set("FFMPEG_${component}_FOUND" 0 - PARENT_SCOPE) - set(what) - if (NOT FFMPEG_${component}_LIBRARY) - set(what "library") - endif () - if (NOT FFMPEG_${component}_INCLUDE_DIR) - if (what) - string(APPEND what " or headers") - else () - set(what "headers") - endif () - endif () - set("FFMPEG_${component}_NOT_FOUND_MESSAGE" - "Could not find the ${what} for ${component}." - PARENT_SCOPE) - endif () - endif () -endfunction () - -_ffmpeg_find(avutil avutil.h) -_ffmpeg_find(swresample swresample.h - avutil) -_ffmpeg_find(swscale swscale.h - avutil) -_ffmpeg_find(avcodec avcodec.h - avutil) -_ffmpeg_find(avformat avformat.h - avcodec avutil) -_ffmpeg_find(avfilter avfilter.h - avutil) -_ffmpeg_find(avdevice avdevice.h - avformat avutil) - -if (TARGET FFMPEG::avutil) - set(_ffmpeg_version_header_path "${FFMPEG_avutil_INCLUDE_DIR}/libavutil/ffversion.h") - if (EXISTS "${_ffmpeg_version_header_path}") - file(STRINGS "${_ffmpeg_version_header_path}" _ffmpeg_version - REGEX "FFMPEG_VERSION") - string(REGEX REPLACE ".*\"n?\(.*\)\"" "\\1" FFMPEG_VERSION "${_ffmpeg_version}") - unset(_ffmpeg_version) - else () - set(FFMPEG_VERSION FFMPEG_VERSION-NOTFOUND) - endif () - unset(_ffmpeg_version_header_path) -endif () - -set(FFMPEG_INCLUDE_DIRS) -set(FFMPEG_LIBRARIES) -set(_ffmpeg_required_vars) -foreach (_ffmpeg_component IN LISTS FFMPEG_FIND_COMPONENTS) - if (TARGET "FFMPEG::${_ffmpeg_component}") - set(FFMPEG_${_ffmpeg_component}_INCLUDE_DIRS - "${FFMPEG_${_ffmpeg_component}_INCLUDE_DIR}") - set(FFMPEG_${_ffmpeg_component}_LIBRARIES - "${FFMPEG_${_ffmpeg_component}_LIBRARY}") - list(APPEND FFMPEG_INCLUDE_DIRS - "${FFMPEG_${_ffmpeg_component}_INCLUDE_DIRS}") - list(APPEND FFMPEG_LIBRARIES - "${FFMPEG_${_ffmpeg_component}_LIBRARIES}") - if (FFMEG_FIND_REQUIRED_${_ffmpeg_component}) - list(APPEND _ffmpeg_required_vars - "FFMPEG_${_ffmpeg_required_vars}_INCLUDE_DIRS" - "FFMPEG_${_ffmpeg_required_vars}_LIBRARIES") - endif () - endif () -endforeach () -unset(_ffmpeg_component) - -if (FFMPEG_INCLUDE_DIRS) - list(REMOVE_DUPLICATES FFMPEG_INCLUDE_DIRS) -endif () - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(FFMPEG - REQUIRED_VARS FFMPEG_INCLUDE_DIRS FFMPEG_LIBRARIES ${_ffmpeg_required_vars} - VERSION_VAR FFMPEG_VERSION - HANDLE_COMPONENTS) -unset(_ffmpeg_required_vars) diff --git a/docs/source/conf.py b/docs/source/conf.py index 987cac93ce..c19efb44e2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,6 +25,7 @@ sys.path.insert(0, os.path.abspath(".")) + import pytorch_sphinx_theme # -- General configuration ------------------------------------------------ diff --git a/docs/source/functional.rst b/docs/source/functional.rst index fb9bea9cd7..8be4492648 100644 --- a/docs/source/functional.rst +++ b/docs/source/functional.rst @@ -80,7 +80,6 @@ Feature Extractions compute_deltas detect_pitch_frequency sliding_window_cmn - compute_kaldi_pitch spectral_centroid Multi-channel diff --git a/docs/source/index.rst b/docs/source/index.rst index e393a1dd00..e1d64b2241 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,6 +51,7 @@ model implementations and application components. 
tutorials/audio_data_augmentation_tutorial tutorials/audio_feature_extractions_tutorial tutorials/audio_feature_augmentation_tutorial + tutorials/ctc_forced_alignment_api_tutorial tutorials/oscillator_tutorial tutorials/additive_synthesis_tutorial @@ -68,7 +69,7 @@ model implementations and application components. tutorials/asr_inference_with_ctc_decoder_tutorial tutorials/online_asr_tutorial tutorials/device_asr - tutorials/ctc_forced_alignment_api_tutorial + tutorials/forced_alignment_for_multilingual_data_tutorial tutorials/forced_alignment_tutorial tutorials/tacotron2_pipeline_tutorial tutorials/mvdr_tutorial @@ -84,6 +85,7 @@ model implementations and application components. Emformer RNN-T ASR Conv-TasNet Source Separation HuBERT Pre-training and Fine-tuning (ASR) + Conformer/Emformer RNN-T ASR/VSR/AV-ASR .. toctree:: :maxdepth: 1 @@ -157,6 +159,13 @@ Tutorials :link: tutorials/ctc_forced_alignment_api_tutorial.html :tags: CTC,Forced-Alignment +.. customcarditem:: + :header: Forced alignment for multilingual data + :card_description: Learn how to align multilingual data using TorchAudio's CTC forced alignment API (torchaudio.functional.forced_align) and a multilingual Wav2Vec2 model. + :image: https://download.pytorch.org/torchaudio/tutorial-assets/thumbnails/forced_alignment_for_multilingual_data_tutorial.png + :link: tutorials/forced_alignment_for_multilingual_data_tutorial.html + :tags: Forced-Alignment + .. customcarditem:: :header: Streaming media decoding with StreamReader :card_description: Learn how to load audio/video to Tensors using torchaudio.io.StreamReader class.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 20c2d4c7d7..dac1a40864 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -16,7 +16,7 @@ Please refer to https://pytorch.org/get-started/locally/ for the details. each of which requires a corresponding PyTorch distribution. .. note:: - This software was compiled against an unmodified copy of FFmpeg (licensed under `the LGPLv2.1 `_), with the specific rpath removed so as to enable the use of system libraries. The LGPL source can be downloaded `here `_. + This software was compiled against an unmodified copy of FFmpeg (licensed under `the LGPLv2.1 `_), with the specific rpath removed so as to enable the use of system libraries. The LGPL source can be downloaded `here `_. Dependencies ------------ @@ -31,8 +31,8 @@ Optional Dependencies * `FFmpeg `_. Required to use :py:mod:`torchaudio.io` module. - TorchAudio official binary distributions are compatible with FFmpeg 5. - If you need to use FFmpeg 6, please build TorchAudio from source. + TorchAudio official binary distributions are compatible with FFmpeg 4.1 to 4.4. + If you need to use FFmpeg 5, please build TorchAudio from source. * `sentencepiece `_
diff --git a/docs/source/libtorchaudio.stream_reader.rst b/docs/source/libtorchaudio.stream_reader.rst index 0f2b795e7d..36cee80dcb 100644 --- a/docs/source/libtorchaudio.stream_reader.rst +++ b/docs/source/libtorchaudio.stream_reader.rst @@ -5,13 +5,28 @@ torchaudio::io::StreamReader ============================ -.. doxygenclass:: torchaudio::io::StreamReader +``StreamReader`` is the implementation used by Python equivalent and provides similar interface. +When working with custom I/O, such as in-memory data, ``StreamReaderCustomIO`` class can be used. + +Both classes have the same methods defined, so their usages are the same. Constructors ------------ +StreamReader +^^^^^^^^^^^^ + +..
doxygenclass:: torchaudio::io::StreamReader + .. doxygenfunction:: torchaudio::io::StreamReader::StreamReader(const std::string &src, const c10::optional &format = {}, const c10::optional &option = {}) +StreamReaderCustomIO +^^^^^^^^^^^^^^^^^^^^ + +.. doxygenclass:: torchaudio::io::StreamReaderCustomIO + +.. doxygenfunction:: torchaudio::io::StreamReaderCustomIO::StreamReaderCustomIO + Query Methods ------------- diff --git a/docs/source/libtorchaudio.stream_writer.rst b/docs/source/libtorchaudio.stream_writer.rst index ad2ca2ecba..00e469dfbe 100644 --- a/docs/source/libtorchaudio.stream_writer.rst +++ b/docs/source/libtorchaudio.stream_writer.rst @@ -5,13 +5,28 @@ torchaudio::io::StreamWriter ============================ -.. doxygenclass:: torchaudio::io::StreamWriter +``StreamWriter`` is the implementation used by Python equivalent and provides similar interface. +When working with custom I/O, such as in-memory data, ``StreamWriterCustomIO`` class can be used. + +Both classes have the same methods defined, so their usages are the same. Constructors ------------ +StreamWriter +^^^^^^^^^^^^ + +.. doxygenclass:: torchaudio::io::StreamWriter + .. doxygenfunction:: torchaudio::io::StreamWriter::StreamWriter(const std::string &dst, const c10::optional &format = {}) +StreamWriterCustomIO +^^^^^^^^^^^^^^^^^^^^ + +.. doxygenclass:: torchaudio::io::StreamWriterCustomIO + +.. doxygenfunction:: torchaudio::io::StreamWriterCustomIO::StreamWriterCustomIO + Config methods -------------- diff --git a/docs/source/prototype.functional.rst b/docs/source/prototype.functional.rst index fd0f006af6..72f390c71a 100644 --- a/docs/source/prototype.functional.rst +++ b/docs/source/prototype.functional.rst @@ -4,10 +4,15 @@ torchaudio.prototype.functional .. py:module:: torchaudio.prototype.functional .. currentmodule:: torchaudio.prototype.functional -barkscale_fbanks -~~~~~~~~~~~~~~~~ +Utility +~~~~~~~ -.. autofunction:: barkscale_fbanks +.. autosummary:: + :toctree: generated + :nosignatures: + + barkscale_fbanks + chroma_filterbank DSP ~~~ diff --git a/docs/source/prototype.transforms.rst b/docs/source/prototype.transforms.rst index b48bdc8d5a..67737ab7d1 100644 --- a/docs/source/prototype.transforms.rst +++ b/docs/source/prototype.transforms.rst @@ -10,5 +10,7 @@ torchaudio.prototype.transforms :nosignatures: BarkScale - InverseBarkScale BarkSpectrogram + ChromaScale + ChromaSpectrogram + InverseBarkScale diff --git a/examples/asr/avsr_rnnt/README.md b/examples/asr/avsr_rnnt/README.md new file mode 100644 index 0000000000..88e26d042c --- /dev/null +++ b/examples/asr/avsr_rnnt/README.md @@ -0,0 +1,70 @@ +

+# RNN-T ASR/VSR/AV-ASR Examples
+
+This repository contains sample implementations of training and evaluation pipelines for RNN-T based automatic, visual, and audio-visual speech recognition (ASR, VSR, AV-ASR) models on LRS3, covering both streaming and non-streaming modes. We follow the same training pipeline as [AutoAVSR](https://arxiv.org/abs/2303.14307).
+
+## Preparation
+1. Set up the environment.
+```Shell
+conda create -y -n autoavsr python=3.8
+conda activate autoavsr
+```
+
+2. Install nightly versions of PyTorch, Torchvision, and Torchaudio from [source](https://pytorch.org/get-started/), along with the remaining required packages:
+
+```Shell
+pip install pytorch-lightning sentencepiece
+```
+
+3. Preprocess LRS3 into a cropped-face dataset using the scripts in the [data_prep](./data_prep) folder.
+
+4. `[sp_model_path]` is a SentencePiece model used to encode targets; it can be generated using `train_spm.py`.
+
+### Training ASR or VSR model
+
+- `[root_dir]` is the root directory for the LRS3 cropped-face dataset.
+- `[modality]` is the input modality type, including `v`, `a`, and `av`.
+- `[mode]` is the model type, including `online` and `offline`.
+
+```Shell
+python train.py --root-dir [root_dir] \
+    --sp-model-path ./spm_unigram_1023.model \
+    --exp-dir ./exp \
+    --num-nodes 8 \
+    --gpus 8 \
+    --md [modality] \
+    --mode [mode]
+```
+
+### Training AV-ASR model
+
+```Shell
+python train.py --root-dir [root_dir] \
+    --sp-model-path ./spm_unigram_1023.model \
+    --exp-dir ./exp \
+    --num-nodes 8 \
+    --gpus 8 \
+    --md av \
+    --mode [mode]
+```
+
+### Evaluating models
+
+```Shell
+python eval.py --dataset-path [dataset_path] \
+    --sp-model-path ./spm_unigram_1023.model \
+    --md [modality] \
+    --mode [mode] \
+    --checkpoint-path [checkpoint_path]
+```
+
+The table below reports WER for the AV-ASR models (offline evaluation).
+ +| Model | WER [%] | Params (M) | +|:-----------:|:------------:|:--------------:| +| Non-streaming models | | +| AV-ASR | 4.0 | 50 | +| Streaming models | | +| AV-ASR | 4.3 | 40 | diff --git a/examples/asr/avsr_rnnt/average_checkpoints.py b/examples/asr/avsr_rnnt/average_checkpoints.py new file mode 100644 index 0000000000..74cf20f959 --- /dev/null +++ b/examples/asr/avsr_rnnt/average_checkpoints.py @@ -0,0 +1,31 @@ +import os + +import torch + + +def average_checkpoints(last): + avg = None + for path in last: + states = torch.load(path, map_location=lambda storage, loc: storage)["state_dict"] + if avg is None: + avg = states + else: + for k in avg.keys(): + avg[k] += states[k] + # average + for k in avg.keys(): + if avg[k] is not None: + if avg[k].is_floating_point(): + avg[k] /= len(last) + else: + avg[k] //= len(last) + return avg + + +def ensemble(args): + last = [ + os.path.join(args.exp_dir, args.experiment_name, f"epoch={n}.ckpt") + for n in range(args.epochs - 10, args.epochs) + ] + model_path = os.path.join(args.exp_dir, args.experiment_name, "model_avg_10.pth") + torch.save({"state_dict": average_checkpoints(last)}, model_path) diff --git a/examples/asr/avsr_rnnt/data_module.py b/examples/asr/avsr_rnnt/data_module.py new file mode 100644 index 0000000000..060fde60f0 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_module.py @@ -0,0 +1,167 @@ +import random + +import torch + +from lrs3 import LRS3 +from pytorch_lightning import LightningDataModule + + +def _batch_by_token_count(idx_target_lengths, max_frames, batch_size=None): + batches = [] + current_batch = [] + current_token_count = 0 + for idx, target_length in idx_target_lengths: + if current_token_count + target_length > max_frames or (batch_size and len(current_batch) == batch_size): + batches.append(current_batch) + current_batch = [idx] + current_token_count = target_length + else: + current_batch.append(idx) + current_token_count += target_length + + if current_batch: + batches.append(current_batch) + + return batches + + +class CustomBucketDataset(torch.utils.data.Dataset): + def __init__( + self, + dataset, + lengths, + max_frames, + num_buckets, + shuffle=False, + batch_size=None, + ): + super().__init__() + + assert len(dataset) == len(lengths) + + self.dataset = dataset + + max_length = max(lengths) + min_length = min(lengths) + + assert max_frames >= max_length + + buckets = torch.linspace(min_length, max_length, num_buckets) + lengths = torch.tensor(lengths) + bucket_assignments = torch.bucketize(lengths, buckets) + + idx_length_buckets = [(idx, length, bucket_assignments[idx]) for idx, length in enumerate(lengths)] + if shuffle: + idx_length_buckets = random.sample(idx_length_buckets, len(idx_length_buckets)) + else: + idx_length_buckets = sorted(idx_length_buckets, key=lambda x: x[1], reverse=True) + + sorted_idx_length_buckets = sorted(idx_length_buckets, key=lambda x: x[2]) + self.batches = _batch_by_token_count( + [(idx, length) for idx, length, _ in sorted_idx_length_buckets], + max_frames, + batch_size=batch_size, + ) + + def __getitem__(self, idx): + return [self.dataset[subidx] for subidx in self.batches[idx]] + + def __len__(self): + return len(self.batches) + + +class TransformDataset(torch.utils.data.Dataset): + def __init__(self, dataset, transform_fn): + self.dataset = dataset + self.transform_fn = transform_fn + + def __getitem__(self, idx): + return self.transform_fn(self.dataset[idx]) + + def __len__(self): + return len(self.dataset) + + +class LRS3DataModule(LightningDataModule): + def 
__init__( + self, + *, + args, + train_transform, + val_transform, + test_transform, + max_frames, + batch_size=None, + train_num_buckets=50, + train_shuffle=True, + num_workers=10, + ): + super().__init__() + self.args = args + self.train_dataset_lengths = None + self.val_dataset_lengths = None + self.train_transform = train_transform + self.val_transform = val_transform + self.test_transform = test_transform + self.max_frames = max_frames + self.batch_size = batch_size + self.train_num_buckets = train_num_buckets + self.train_shuffle = train_shuffle + self.num_workers = num_workers + + def train_dataloader(self): + datasets = [LRS3(self.args, subset="train")] + + if not self.train_dataset_lengths: + self.train_dataset_lengths = [dataset._lengthlist for dataset in datasets] + + dataset = torch.utils.data.ConcatDataset( + [ + CustomBucketDataset( + dataset, + lengths, + self.max_frames, + self.train_num_buckets, + batch_size=self.batch_size, + ) + for dataset, lengths in zip(datasets, self.train_dataset_lengths) + ] + ) + + dataset = TransformDataset(dataset, self.train_transform) + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=self.num_workers, + batch_size=None, + shuffle=self.train_shuffle, + ) + return dataloader + + def val_dataloader(self): + datasets = [LRS3(self.args, subset="val")] + + if not self.val_dataset_lengths: + self.val_dataset_lengths = [dataset._lengthlist for dataset in datasets] + + dataset = torch.utils.data.ConcatDataset( + [ + CustomBucketDataset( + dataset, + lengths, + self.max_frames, + 1, + batch_size=self.batch_size, + ) + for dataset, lengths in zip(datasets, self.val_dataset_lengths) + ] + ) + + dataset = TransformDataset(dataset, self.val_transform) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=self.num_workers) + return dataloader + + def test_dataloader(self): + dataset = LRS3(self.args, subset="test") + dataset = TransformDataset(dataset, self.test_transform) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=None) + return dataloader diff --git a/examples/asr/avsr_rnnt/data_prep/README.md b/examples/asr/avsr_rnnt/data_prep/README.md new file mode 100644 index 0000000000..995f9ec5be --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/README.md @@ -0,0 +1,48 @@ + +# Preprocessing LRS3 + +We provide a pre-processing pipeline to detect and crop full-face images in this repository. + +## Prerequisites + +Install all dependency-packages. + +```Shell +pip install -r requirements.txt +``` + +Install [RetinaFace](./tools) tracker. + +## Preprocessing + +### Step 1. Pre-process the LRS3 dataset. +Please run the following script to pre-process the LRS3 dataset: + +```Shell +python main.py \ + --data-dir=[data_dir] \ + --dataset=[dataset] \ + --root=[root] \ + --folder=[folder] \ + --groups=[num_groups] \ + --job-index=[job_index] +``` + +- `[data_dir]` and `[landmarks_dir]` are the directories for original dataset and corresponding landmarks. + +- `[root]` is the directory for saved cropped-face dataset. + +- `[folder]` can be set to `train` or `test`. + +- `[num_groups]` and `[job-index]` are used to split the dataset into multiple threads, where `[job-index]` is an integer in [0, `[num_groups]`). + +### Step 2. Merge the label list. +After completing Step 2, run the following script to merge all labels. 
+ +```Shell +python merge.py \ + --dataset=[dataset] \ + --root=[root] \ + --folder=[folder] \ + --groups=[num_groups] \ +``` diff --git a/examples/asr/avsr_rnnt/data_prep/data/data_module.py b/examples/asr/avsr_rnnt/data_prep/data/data_module.py new file mode 100644 index 0000000000..556a1bf369 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/data/data_module.py @@ -0,0 +1,46 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2023 Imperial College London (Pingchuan Ma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import torch +import torchaudio +import torchvision + + +class AVSRDataLoader: + def __init__(self, modality, detector="retinaface", resize=None): + self.modality = modality + if modality == "video": + if detector == "retinaface": + from detectors.retinaface.detector import LandmarksDetector + from detectors.retinaface.video_process import VideoProcess + + self.landmarks_detector = LandmarksDetector(device="cuda:0") + self.video_process = VideoProcess(resize=resize) + + def load_data(self, data_filename, transform=True): + if self.modality == "audio": + audio, sample_rate = self.load_audio(data_filename) + audio = self.audio_process(audio, sample_rate) + return audio + if self.modality == "video": + landmarks = self.landmarks_detector(data_filename) + video = self.load_video(data_filename) + video = self.video_process(video, landmarks) + video = torch.tensor(video) + return video + + def load_audio(self, data_filename): + waveform, sample_rate = torchaudio.load(data_filename, normalize=True) + return waveform, sample_rate + + def load_video(self, data_filename): + return torchvision.io.read_video(data_filename, pts_unit="sec")[0].numpy() + + def audio_process(self, waveform, sample_rate, target_sample_rate=16000): + if sample_rate != target_sample_rate: + waveform = torchaudio.functional.resample(waveform, sample_rate, target_sample_rate) + waveform = torch.mean(waveform, dim=0, keepdim=True) + return waveform diff --git a/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/detector.py b/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/detector.py new file mode 100644 index 0000000000..2044627045 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/detector.py @@ -0,0 +1,31 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright 2021 Imperial College London (Pingchuan Ma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import warnings + +import numpy as np +import torchvision +from ibug.face_detection import RetinaFacePredictor + +warnings.filterwarnings("ignore") + + +class LandmarksDetector: + def __init__(self, device="cuda:0", model_name="resnet50"): + self.face_detector = RetinaFacePredictor( + device=device, threshold=0.8, model=RetinaFacePredictor.get_model(model_name) + ) + + def __call__(self, filename): + video_frames = torchvision.io.read_video(filename, pts_unit="sec")[0].numpy() + landmarks = [] + for frame in video_frames: + detected_faces = self.face_detector(frame, rgb=False) + if len(detected_faces) >= 1: + landmarks.append(np.reshape(detected_faces[0][:4], (2, 2))) + else: + landmarks.append(None) + return landmarks diff --git a/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/video_process.py b/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/video_process.py new file mode 100644 index 0000000000..375fd4c428 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/detectors/retinaface/video_process.py @@ -0,0 +1,158 @@ +import cv2 +import numpy as np +from skimage import transform as tf + + +def linear_interpolate(landmarks, start_idx, stop_idx): + start_landmarks = landmarks[start_idx] + stop_landmarks = landmarks[stop_idx] + delta = stop_landmarks - start_landmarks + for idx in range(1, stop_idx - start_idx): + landmarks[start_idx + idx] = start_landmarks + idx / float(stop_idx - start_idx) * delta + return landmarks + + +def warp_img(src, dst, img, std_size): + tform = tf.estimate_transform("similarity", src, dst) + warped = tf.warp(img, inverse_map=tform.inverse, output_shape=std_size) + warped = (warped * 255).astype("uint8") + return warped, tform + + +def apply_transform(transform, img, std_size): + warped = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size) + warped = (warped * 255).astype("uint8") + return warped + + +def cut_patch(img, landmarks, height, width, threshold=5): + center_x, center_y = np.mean(landmarks, axis=0) + # Check for too much bias in height and width + if abs(center_y - img.shape[0] / 2) > height + threshold: + raise Exception("too much bias in height") + if abs(center_x - img.shape[1] / 2) > width + threshold: + raise Exception("too much bias in width") + # Calculate bounding box coordinates + y_min = int(round(np.clip(center_y - height, 0, img.shape[0]))) + y_max = int(round(np.clip(center_y + height, 0, img.shape[0]))) + x_min = int(round(np.clip(center_x - width, 0, img.shape[1]))) + x_max = int(round(np.clip(center_x + width, 0, img.shape[1]))) + # Cut the image + cutted_img = np.copy(img[y_min:y_max, x_min:x_max]) + return cutted_img + + +class VideoProcess: + def __init__( + self, + crop_width=128, + crop_height=128, + target_size=(224, 224), + reference_size=(224, 224), + stable_points=(0, 1), + start_idx=0, + stop_idx=2, + resize=(96, 96), + ): + self.reference = np.array(([[51.64568, 0.70204943], [171.95107, 159.59505]])) + self.crop_width = crop_width + self.crop_height = crop_height + self.start_idx = start_idx + self.stop_idx = stop_idx + self.resize = resize + + def __call__(self, video, landmarks): + # Pre-process landmarks: interpolate frames that are not detected + preprocessed_landmarks = self.interpolate_landmarks(landmarks) + # Exclude corner cases: no landmark in all frames or number of frames is less than window length + if not 
preprocessed_landmarks: + return + # Affine transformation and crop patch + sequence = self.crop_patch(video, preprocessed_landmarks) + assert sequence is not None, "crop an empty patch." + return sequence + + def crop_patch(self, video, landmarks): + sequence = [] + for frame_idx, frame in enumerate(video): + transformed_frame, transformed_landmarks = self.affine_transform( + frame, landmarks[frame_idx], self.reference + ) + patch = cut_patch( + transformed_frame, + transformed_landmarks[self.start_idx : self.stop_idx], + self.crop_height // 2, + self.crop_width // 2, + ) + if self.resize: + patch = cv2.resize(patch, self.resize) + sequence.append(patch) + return np.array(sequence) + + def interpolate_landmarks(self, landmarks): + valid_frames_idx = [idx for idx, lm in enumerate(landmarks) if lm is not None] + + if not valid_frames_idx: + return None + + for idx in range(1, len(valid_frames_idx)): + if valid_frames_idx[idx] - valid_frames_idx[idx - 1] > 1: + landmarks = linear_interpolate(landmarks, valid_frames_idx[idx - 1], valid_frames_idx[idx]) + + valid_frames_idx = [idx for idx, lm in enumerate(landmarks) if lm is not None] + + # Handle corner case: keep frames at the beginning or at the end that failed to be detected + if valid_frames_idx: + landmarks[: valid_frames_idx[0]] = [landmarks[valid_frames_idx[0]]] * valid_frames_idx[0] + landmarks[valid_frames_idx[-1] :] = [landmarks[valid_frames_idx[-1]]] * ( + len(landmarks) - valid_frames_idx[-1] + ) + + assert all(lm is not None for lm in landmarks), "not every frame has landmark" + + return landmarks + + def affine_transform( + self, + frame, + landmarks, + reference, + target_size=(224, 224), + reference_size=(224, 224), + stable_points=(0, 1), + interpolation=cv2.INTER_LINEAR, + border_mode=cv2.BORDER_CONSTANT, + border_value=0, + ): + stable_reference = self.get_stable_reference(reference, stable_points, reference_size, target_size) + transform = self.estimate_affine_transform(landmarks, stable_points, stable_reference) + transformed_frame, transformed_landmarks = self.apply_affine_transform( + frame, landmarks, transform, target_size, interpolation, border_mode, border_value + ) + + return transformed_frame, transformed_landmarks + + def get_stable_reference(self, reference, stable_points, reference_size, target_size): + stable_reference = np.vstack([reference[x] for x in stable_points]) + stable_reference[:, 0] -= (reference_size[0] - target_size[0]) / 2.0 + stable_reference[:, 1] -= (reference_size[1] - target_size[1]) / 2.0 + return stable_reference + + def estimate_affine_transform(self, landmarks, stable_points, stable_reference): + return cv2.estimateAffinePartial2D( + np.vstack([landmarks[x] for x in stable_points]), stable_reference, method=cv2.LMEDS + )[0] + + def apply_affine_transform( + self, frame, landmarks, transform, target_size, interpolation, border_mode, border_value + ): + transformed_frame = cv2.warpAffine( + frame, + transform, + dsize=(target_size[0], target_size[1]), + flags=interpolation, + borderMode=border_mode, + borderValue=border_value, + ) + transformed_landmarks = np.matmul(landmarks, transform[:, :2].transpose()) + transform[:, 2].transpose() + return transformed_frame, transformed_landmarks diff --git a/examples/asr/avsr_rnnt/data_prep/main.py b/examples/asr/avsr_rnnt/data_prep/main.py new file mode 100644 index 0000000000..c9826f447b --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/main.py @@ -0,0 +1,203 @@ +import glob +import math +import os +import shutil + +import warnings + +import 
ffmpeg +from data.data_module import AVSRDataLoader +from tqdm import tqdm +from utils import save_vid_aud_txt, split_file + +warnings.filterwarnings("ignore") + +from argparse import ArgumentParser + + +def load_args(default_config=None): + parser = ArgumentParser(description="Preprocess LRS3 to crop full-face images") + # -- for benchmark evaluation + parser.add_argument( + "--data-dir", + type=str, + help="The directory for sequence.", + ) + parser.add_argument( + "--dataset", + type=str, + help="Specify the dataset name used in the experiment", + ) + parser.add_argument( + "--root-dir", + type=str, + help="The root directory of cropped-face dataset.", + ) + parser.add_argument("--job-index", type=int, default=0, help="job index") + parser.add_argument( + "--groups", + type=int, + default=1, + help="specify the number of threads to be used", + ) + parser.add_argument( + "--folder", + type=str, + default="test", + help="specify the set used in the experiment", + ) + args = parser.parse_args() + return args + + +args = load_args() + +seg_duration = 16 +detector = "retinaface" +dataset = args.dataset + +args.data_dir = os.path.normpath(args.data_dir) +vid_dataloader = AVSRDataLoader(modality="video", detector=detector, resize=(96, 96)) +aud_dataloader = AVSRDataLoader(modality="audio") +# Step 2, extract mouth patches from segments. +seg_vid_len = seg_duration * 25 +seg_aud_len = seg_duration * 16000 + +label_filename = os.path.join( + args.root_dir, + "labels", + f"{dataset}_{args.folder}_transcript_lengths_seg{seg_duration}s.csv" + if args.groups <= 1 + else f"{dataset}_{args.folder}_transcript_lengths_seg{seg_duration}s.{args.groups}.{args.job_index}.csv", +) +os.makedirs(os.path.dirname(label_filename), exist_ok=True) +print(f"Directory {os.path.dirname(label_filename)} created") + +f = open(label_filename, "w") +# Step 2, extract mouth patches from segments. 
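+# For each .mp4 below, the loop crops the mouth region, keeps trainval/test clips whole
+# and splits pretrain clips into <=16 s segments, drops clips whose audio-to-video length
+# ratio falls outside 560-720 samples per frame (nominal 16000 Hz / 25 fps = 640),
+# re-muxes the cropped video with its audio via ffmpeg, and appends one
+# "<dataset>,<relative path>,<num video frames>,<transcript length>" line to the label CSV.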
+dst_vid_dir = os.path.join(args.root_dir, dataset, dataset + f"_video_seg{seg_duration}s") +dst_txt_dir = os.path.join(args.root_dir, dataset, dataset + f"_text_seg{seg_duration}s") +if args.folder == "test": + filenames = glob.glob(os.path.join(args.data_dir, args.folder, "**", "*.mp4"), recursive=True) +elif args.folder == "train": + filenames = glob.glob(os.path.join(args.data_dir, "trainval", "**", "*.mp4"), recursive=True) + filenames.extend(glob.glob(os.path.join(args.data_dir, "pretrain", "**", "*.mp4"), recursive=True)) + filenames.sort() +else: + raise NotImplementedError + +unit = math.ceil(len(filenames) * 1.0 / args.groups) +filenames = filenames[args.job_index * unit : (args.job_index + 1) * unit] + +for data_filename in tqdm(filenames): + try: + video_data = vid_dataloader.load_data(data_filename) + audio_data = aud_dataloader.load_data(data_filename) + except UnboundLocalError: + continue + + if os.path.normpath(data_filename).split(os.sep)[-3] in ["trainval", "test", "main"]: + dst_vid_filename = f"{data_filename.replace(args.data_dir, dst_vid_dir)[:-4]}.mp4" + dst_aud_filename = f"{data_filename.replace(args.data_dir, dst_vid_dir)[:-4]}.wav" + dst_txt_filename = f"{data_filename.replace(args.data_dir, dst_txt_dir)[:-4]}.txt" + trim_vid_data, trim_aud_data = video_data, audio_data + text_line_list = open(data_filename[:-4] + ".txt", "r").read().splitlines()[0].split(" ") + text_line = " ".join(text_line_list[2:]) + content = text_line.replace("}", "").replace("{", "") + + if trim_vid_data is None or trim_aud_data is None: + continue + video_length = len(trim_vid_data) + audio_length = trim_aud_data.size(1) + if video_length == 0 or audio_length == 0: + continue + if audio_length / video_length < 560.0 or audio_length / video_length > 720.0 or video_length < 12: + continue + save_vid_aud_txt( + dst_vid_filename, + dst_aud_filename, + dst_txt_filename, + trim_vid_data, + trim_aud_data, + content, + video_fps=25, + audio_sample_rate=16000, + ) + + in1 = ffmpeg.input(dst_vid_filename) + in2 = ffmpeg.input(dst_aud_filename) + out = ffmpeg.output( + in1["v"], + in2["a"], + dst_vid_filename[:-4] + ".m.mp4", + vcodec="copy", + acodec="aac", + strict="experimental", + loglevel="panic", + ) + out.run() + os.remove(dst_aud_filename) + os.remove(dst_vid_filename) + shutil.move(dst_vid_filename[:-4] + ".m.mp4", dst_vid_filename) + + basename = os.path.relpath(dst_vid_filename, start=os.path.join(args.root_dir, dataset)) + f.write("{}\n".format(f"{dataset},{basename},{trim_vid_data.shape[0]},{len(content)}")) + continue + + splitted = split_file(data_filename[:-4] + ".txt", max_frames=seg_vid_len) + for i in range(len(splitted)): + if len(splitted) == 1: + content, start, end, duration = splitted[i] + trim_vid_data, trim_aud_data = video_data, audio_data + else: + content, start, end, duration = splitted[i] + start_idx, end_idx = int(start * 25), int(end * 25) + try: + trim_vid_data, trim_aud_data = ( + video_data[start_idx:end_idx], + audio_data[:, start_idx * 640 : end_idx * 640], + ) + except TypeError: + continue + dst_vid_filename = f"{data_filename.replace(args.data_dir, dst_vid_dir)[:-4]}_{i:02d}.mp4" + dst_aud_filename = f"{data_filename.replace(args.data_dir, dst_vid_dir)[:-4]}_{i:02d}.wav" + dst_txt_filename = f"{data_filename.replace(args.data_dir, dst_txt_dir)[:-4]}_{i:02d}.txt" + + if trim_vid_data is None or trim_aud_data is None: + continue + video_length = len(trim_vid_data) + audio_length = trim_aud_data.size(1) + if video_length == 0 or audio_length == 0: + continue + 
if audio_length / video_length < 560.0 or audio_length / video_length > 720.0 or video_length < 12: + continue + save_vid_aud_txt( + dst_vid_filename, + dst_aud_filename, + dst_txt_filename, + trim_vid_data, + trim_aud_data, + content, + video_fps=25, + audio_sample_rate=16000, + ) + + in1 = ffmpeg.input(dst_vid_filename) + in2 = ffmpeg.input(dst_aud_filename) + out = ffmpeg.output( + in1["v"], + in2["a"], + dst_vid_filename[:-4] + ".m.mp4", + vcodec="copy", + acodec="aac", + strict="experimental", + loglevel="panic", + ) + out.run() + os.remove(dst_aud_filename) + os.remove(dst_vid_filename) + shutil.move(dst_vid_filename[:-4] + ".m.mp4", dst_vid_filename) + + basename = os.path.relpath(dst_vid_filename, start=os.path.join(args.root_dir, dataset)) + f.write("{}\n".format(f"{dataset},{basename},{trim_vid_data.shape[0]},{len(content)}")) +f.close() diff --git a/examples/asr/avsr_rnnt/data_prep/merge.py b/examples/asr/avsr_rnnt/data_prep/merge.py new file mode 100644 index 0000000000..132a3cce7d --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/merge.py @@ -0,0 +1,78 @@ +import os +from argparse import ArgumentParser + + +def load_args(default_config=None): + parser = ArgumentParser() + parser.add_argument( + "--dataset", + type=str, + help="Specify the dataset name used in the experiment", + ) + parser.add_argument( + "--subset", + type=str, + help="Specify the set used in the experiment", + ) + parser.add_argument( + "--root-dir", + type=str, + help="The root directory of saved mouth patches or embeddings.", + ) + parser.add_argument( + "--groups", + type=int, + help="Specify the number of threads to be used", + ) + parser.add_argument( + "--seg-duration", + type=int, + default=16, + help="Specify the segment length", + ) + args = parser.parse_args() + return args + + +args = load_args() + +dataset = args.dataset +subset = args.subset +seg_duration = args.seg_duration + +# Check that there is more than one group +assert args.groups > 1, "There is no need to use this script for merging when --groups is 1." + +# Create the filename template for label files +label_template = os.path.join( + args.root_dir, "labels", f"{dataset}_{subset}_transcript_lengths_seg{seg_duration}s.{args.groups}" +) + +lines = [] +for job_index in range(args.groups): + label_filename = f"{label_template}.{job_index}.csv" + assert os.path.exists(label_filename), f"{label_filename} does not exist." 
+
+    with open(label_filename, "r") as file:
+        lines.extend(file.read().splitlines())
+
+# Write the merged labels to a new file
+dst_label_filename = os.path.join(
+    args.root_dir, dataset, f"{dataset}_{subset}_transcript_lengths_seg{seg_duration}s.csv"
+)
+
+with open(dst_label_filename, "w") as file:
+    file.write("\n".join(lines))
+
+# Print the number of files and total duration in hours
+total_duration = sum(int(line.split(",")[2]) for line in lines) / 3600.0 / 25.0
+print(f"The completed set has {len(lines)} files with a total of {total_duration} hours.")
+
+# Remove the label files for each job index
+print("** Remove the temporary label files **")
+for job_index in range(args.groups):
+    label_filename = f"{label_template}.{job_index}.csv"
+    if os.path.exists(label_filename):
+        os.remove(label_filename)
+
+print("** Finish **") diff --git a/examples/asr/avsr_rnnt/data_prep/requirements.txt b/examples/asr/avsr_rnnt/data_prep/requirements.txt new file mode 100644 index 0000000000..3c1dd8f6dd --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/requirements.txt @@ -0,0 +1,3 @@ +scikit-image
+opencv-python
+ffmpeg-python diff --git a/examples/asr/avsr_rnnt/data_prep/tools/README.md b/examples/asr/avsr_rnnt/data_prep/tools/README.md new file mode 100644 index 0000000000..9d30695ae2 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/tools/README.md @@ -0,0 +1,18 @@ +## Face Detection
+We use [ibug.face_detection](https://github.com/hhj1897/face_detection) for face detection.
+
+### Prerequisites
+* [Git LFS](https://git-lfs.github.com/), needed for downloading the pretrained weights that are larger than 100 MB.
+
+You can install *`Homebrew`* and then install *`git-lfs`* without sudo privileges.
+
+### From source
+
+1. Install *`ibug.face_detection`*
+```Shell
+git clone https://github.com/hhj1897/face_detection.git
+cd face_detection
+git lfs pull
+pip install -e .
+cd ..
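+# Optional check (not part of the original instructions): confirm the package imports
+python -c "from ibug.face_detection import RetinaFacePredictor"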
+``` diff --git a/examples/asr/avsr_rnnt/data_prep/utils.py b/examples/asr/avsr_rnnt/data_prep/utils.py new file mode 100644 index 0000000000..a557a94899 --- /dev/null +++ b/examples/asr/avsr_rnnt/data_prep/utils.py @@ -0,0 +1,87 @@ +import os + +import torchaudio +import torchvision + + +def split_file(filename, max_frames=600, fps=25.0): + + lines = open(filename).read().splitlines() + + flag = 0 + stack = [] + res = [] + + tmp = 0 + start_timestamp = 0.0 + + threshold = max_frames / fps + + for line in lines: + if "WORD START END ASDSCORE" in line: + flag = 1 + continue + if flag: + word, start, end, score = line.split(" ") + start, end, score = float(start), float(end), float(score) + if end < tmp + threshold: + stack.append(word) + last_timestamp = end + else: + res.append([" ".join(stack), start_timestamp, last_timestamp, last_timestamp - start_timestamp]) + tmp = start + start_timestamp = start + stack = [word] + if stack: + res.append([" ".join(stack), start_timestamp, end, end - start_timestamp]) + return res + + +def save_vid_txt(dst_vid_filename, dst_txt_filename, trim_video_data, content, video_fps=25): + # -- save video + save2vid(dst_vid_filename, trim_video_data, video_fps) + # -- save text + os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) + f = open(dst_txt_filename, "w") + f.write(f"{content}") + f.close() + + +def save_vid_aud( + dst_vid_filename, dst_aud_filename, trim_vid_data, trim_aud_data, video_fps=25, audio_sample_rate=16000 +): + # -- save video + save2vid(dst_vid_filename, trim_vid_data, video_fps) + # -- save audio + save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) + + +def save_vid_aud_txt( + dst_vid_filename, + dst_aud_filename, + dst_txt_filename, + trim_vid_data, + trim_aud_data, + content, + video_fps=25, + audio_sample_rate=16000, +): + # -- save video + save2vid(dst_vid_filename, trim_vid_data, video_fps) + # -- save audio + save2aud(dst_aud_filename, trim_aud_data, audio_sample_rate) + # -- save text + os.makedirs(os.path.dirname(dst_txt_filename), exist_ok=True) + f = open(dst_txt_filename, "w") + f.write(f"{content}") + f.close() + + +def save2vid(filename, vid, frames_per_second): + os.makedirs(os.path.dirname(filename), exist_ok=True) + torchvision.io.write_video(filename, vid, frames_per_second) + + +def save2aud(filename, aud, sample_rate): + os.makedirs(os.path.dirname(filename), exist_ok=True) + torchaudio.save(filename, aud, sample_rate) diff --git a/examples/asr/avsr_rnnt/doc/lip_white.png b/examples/asr/avsr_rnnt/doc/lip_white.png new file mode 100644 index 0000000000..2d6f20ec13 Binary files /dev/null and b/examples/asr/avsr_rnnt/doc/lip_white.png differ diff --git a/examples/asr/avsr_rnnt/eval.py b/examples/asr/avsr_rnnt/eval.py new file mode 100644 index 0000000000..174c348f00 --- /dev/null +++ b/examples/asr/avsr_rnnt/eval.py @@ -0,0 +1,105 @@ +import logging +from argparse import ArgumentParser + +import sentencepiece as spm +import torch +import torchaudio +from transforms import get_data_module + + +logger = logging.getLogger(__name__) + + +def compute_word_level_distance(seq1, seq2): + return torchaudio.functional.edit_distance(seq1.lower().split(), seq2.lower().split()) + + +def get_lightning_module(args): + sp_model = spm.SentencePieceProcessor(model_file=str(args.sp_model_path)) + if args.md == "av": + from lightning_av import AVConformerRNNTModule + + model = AVConformerRNNTModule(args, sp_model) + else: + from lightning import ConformerRNNTModule + + model = ConformerRNNTModule(args, sp_model) + ckpt = 
torch.load(args.checkpoint_path, map_location=lambda storage, loc: storage)["state_dict"] + model.load_state_dict(ckpt) + model.eval() + return model + + +def run_eval(model, data_module): + total_edit_distance = 0 + total_length = 0 + dataloader = data_module.test_dataloader() + with torch.no_grad(): + for idx, (batch, sample) in enumerate(dataloader): + actual = sample[0][-1] + predicted = model(batch) + total_edit_distance += compute_word_level_distance(actual, predicted) + total_length += len(actual.split()) + if idx % 100 == 0: + logger.warning(f"Processed elem {idx}; WER: {total_edit_distance / total_length}") + logger.warning(f"Final WER: {total_edit_distance / total_length}") + return total_edit_distance / total_length + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + "--md", + type=str, + help="Modality", + required=True, + ) + parser.add_argument( + "--mode", + type=str, + help="Perform online or offline recognition.", + required=True, + ) + parser.add_argument( + "--root-dir", + type=str, + help="Root directory to LRS3 audio-visual datasets.", + required=True, + ) + parser.add_argument( + "--sp-model-path", + type=str, + help="Path to SentencePiece model.", + required=True, + ) + parser.add_argument( + "--checkpoint-path", + type=str, + help="Path to checkpoint model.", + required=True, + ) + parser.add_argument( + "--pretrained-model-path", + type=str, + help="Path to Pretraned model.", + ) + parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging") + return parser.parse_args() + + +def init_logger(debug): + fmt = "%(asctime)s %(message)s" if debug else "%(message)s" + level = logging.DEBUG if debug else logging.INFO + logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S") + + +def cli_main(): + args = parse_args() + init_logger(args.debug) + model = get_lightning_module(args) + data_module = get_data_module(args, str(args.sp_model_path)) + run_eval(model, data_module) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/asr/avsr_rnnt/lightning.py b/examples/asr/avsr_rnnt/lightning.py new file mode 100644 index 0000000000..ed2c24edbc --- /dev/null +++ b/examples/asr/avsr_rnnt/lightning.py @@ -0,0 +1,168 @@ +import itertools +import math + +from collections import namedtuple +from typing import List, Tuple + +import sentencepiece as spm + +import torch +import torchaudio +from models.conformer_rnnt import conformer_rnnt +from models.emformer_rnnt import emformer_rnnt +from models.resnet import video_resnet +from models.resnet1d import audio_resnet +from pytorch_lightning import LightningModule +from schedulers import WarmupCosineScheduler +from torchaudio.models import Hypothesis, RNNTBeamSearch + +_expected_spm_vocab_size = 1023 + +Batch = namedtuple("Batch", ["inputs", "input_lengths", "targets", "target_lengths"]) + + +def post_process_hypos( + hypos: List[Hypothesis], sp_model: spm.SentencePieceProcessor +) -> List[Tuple[str, float, List[int], List[int]]]: + tokens_idx = 0 + score_idx = 3 + post_process_remove_list = [ + sp_model.unk_id(), + sp_model.eos_id(), + sp_model.pad_id(), + ] + filtered_hypo_tokens = [ + [token_index for token_index in h[tokens_idx][1:] if token_index not in post_process_remove_list] for h in hypos + ] + hypos_str = [sp_model.decode(s) for s in filtered_hypo_tokens] + hypos_ids = [h[tokens_idx][1:] for h in hypos] + hypos_score = [[math.exp(h[score_idx])] for h in hypos] + + nbest_batch = list(zip(hypos_str, hypos_score, hypos_ids)) + + return 
nbest_batch + + +class ConformerRNNTModule(LightningModule): + def __init__(self, args=None, sp_model=None, pretrained_model_path=None): + super().__init__() + self.save_hyperparameters(args) + self.args = args + self.sp_model = sp_model + spm_vocab_size = self.sp_model.get_piece_size() + assert spm_vocab_size == _expected_spm_vocab_size, ( + "The model returned by conformer_rnnt_base expects a SentencePiece model of " + f"vocabulary size {_expected_spm_vocab_size}, but the given SentencePiece model has a vocabulary size " + f"of {spm_vocab_size}. Please provide a correctly configured SentencePiece model." + ) + self.blank_idx = spm_vocab_size + + if args.md == "v": + self.frontend = video_resnet() + if args.md == "a": + self.frontend = audio_resnet() + + if args.mode == "online": + self.model = emformer_rnnt() + if args.mode == "offline": + self.model = conformer_rnnt() + + # -- initialise + if args.pretrained_model_path: + ckpt = torch.load(args.pretrained_model_path, map_location=lambda storage, loc: storage) + tmp_ckpt = { + k.replace("encoder.frontend.", ""): v for k, v in ckpt.items() if k.startswith("encoder.frontend.") + } + self.frontend.load_state_dict(tmp_ckpt) + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum") + + self.optimizer = torch.optim.AdamW( + itertools.chain(*([self.frontend.parameters(), self.model.parameters()])), + lr=8e-4, + weight_decay=0.06, + betas=(0.9, 0.98), + ) + + self.automatic_optimization = False + + def _step(self, batch, _, step_type): + if batch is None: + return None + + prepended_targets = batch.targets.new_empty([batch.targets.size(0), batch.targets.size(1) + 1]) + prepended_targets[:, 1:] = batch.targets + prepended_targets[:, 0] = self.blank_idx + prepended_target_lengths = batch.target_lengths + 1 + features = self.frontend(batch.inputs) + output, src_lengths, _, _ = self.model( + features, batch.input_lengths, prepended_targets, prepended_target_lengths + ) + loss = self.loss(output, batch.targets, src_lengths, batch.target_lengths) + self.log(f"Losses/{step_type}_loss", loss, on_step=True, on_epoch=True) + + return loss + + def configure_optimizers(self): + self.warmup_lr_scheduler = WarmupCosineScheduler( + self.optimizer, + 10, + self.args.epochs, + len(self.trainer.datamodule.train_dataloader()) / self.trainer.num_devices / self.trainer.num_nodes, + ) + self.lr_scheduler_interval = "step" + return ( + [self.optimizer], + [{"scheduler": self.warmup_lr_scheduler, "interval": self.lr_scheduler_interval}], + ) + + def forward(self, batch: Batch): + decoder = RNNTBeamSearch(self.model, self.blank_idx) + x = self.frontend(batch.inputs.to(self.device)) + hypotheses = decoder(x, batch.input_lengths.to(self.device), beam_width=20) + return post_process_hypos(hypotheses, self.sp_model)[0][0] + + def training_step(self, batch: Batch, batch_idx): + """Custom training step. + + By default, DDP does the following on each train step: + - For each GPU, compute loss and gradient on shard of training data. + - Sync and average gradients across all GPUs. The final gradient + is (sum of gradients across all GPUs) / N, where N is the world + size (total number of GPUs). + - Update parameters on each GPU. + + Here, we do the following: + - For k-th GPU, compute loss and scale it by (N / B_total), where B_total is + the sum of batch sizes across all GPUs. Compute gradient from scaled loss. + - Sync and average gradients across all GPUs. The final gradient + is (sum of gradients across all GPUs) / B_total. + - Update parameters on each GPU. 
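+
+        For example, with 2 GPUs and local batch sizes 3 and 5 (B_total = 8),
+        each GPU's summed loss is scaled by 2 / 8; after DDP averages the two
+        gradients, the result equals (sum of per-sample gradients) / 8, i.e. the
+        mean over the global batch regardless of how unevenly it is sharded.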
+ + Doing so allows us to account for the variability in batch sizes that + variable-length sequential data commonly yields. + """ + + opt = self.optimizers() + opt.zero_grad() + loss = self._step(batch, batch_idx, "train") + batch_size = batch.inputs.size(0) + batch_sizes = self.all_gather(batch_size) + + loss *= batch_sizes.size(0) / batch_sizes.sum() # world size / batch size + self.manual_backward(loss) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10) + opt.step() + + sch = self.lr_schedulers() + sch.step() + + self.log("monitoring_step", self.global_step) + + return loss + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx, "val") + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx, "test") diff --git a/examples/asr/avsr_rnnt/lightning_av.py b/examples/asr/avsr_rnnt/lightning_av.py new file mode 100644 index 0000000000..b0730d43e2 --- /dev/null +++ b/examples/asr/avsr_rnnt/lightning_av.py @@ -0,0 +1,173 @@ +import itertools +import math + +from collections import namedtuple +from typing import List, Tuple + +import sentencepiece as spm + +import torch +import torchaudio +from models.conformer_rnnt import conformer_rnnt +from models.emformer_rnnt import emformer_rnnt +from models.fusion import fusion_module +from models.resnet import video_resnet +from models.resnet1d import audio_resnet +from pytorch_lightning import LightningModule +from schedulers import WarmupCosineScheduler +from torchaudio.models import Hypothesis, RNNTBeamSearch + + +_expected_spm_vocab_size = 1023 + +AVBatch = namedtuple("AVBatch", ["audios", "videos", "audio_lengths", "video_lengths", "targets", "target_lengths"]) + + +def post_process_hypos( + hypos: List[Hypothesis], sp_model: spm.SentencePieceProcessor +) -> List[Tuple[str, float, List[int], List[int]]]: + tokens_idx = 0 + score_idx = 3 + post_process_remove_list = [ + sp_model.unk_id(), + sp_model.eos_id(), + sp_model.pad_id(), + ] + filtered_hypo_tokens = [ + [token_index for token_index in h[tokens_idx][1:] if token_index not in post_process_remove_list] for h in hypos + ] + hypos_str = [sp_model.decode(s) for s in filtered_hypo_tokens] + hypos_ids = [h[tokens_idx][1:] for h in hypos] + hypos_score = [[math.exp(h[score_idx])] for h in hypos] + + nbest_batch = list(zip(hypos_str, hypos_score, hypos_ids)) + + return nbest_batch + + +class AVConformerRNNTModule(LightningModule): + def __init__(self, args=None, sp_model=None): + super().__init__() + self.save_hyperparameters(args) + self.args = args + self.sp_model = sp_model + spm_vocab_size = self.sp_model.get_piece_size() + assert spm_vocab_size == _expected_spm_vocab_size, ( + "The model returned by conformer_rnnt_base expects a SentencePiece model of " + f"vocabulary size {_expected_spm_vocab_size}, but the given SentencePiece model has a vocabulary size " + f"of {spm_vocab_size}. Please provide a correctly configured SentencePiece model." 
+ ) + self.blank_idx = spm_vocab_size + + self.audio_frontend = audio_resnet() + self.video_frontend = video_resnet() + self.fusion = fusion_module() + + frontend_params = [self.video_frontend.parameters(), self.audio_frontend.parameters()] + fusion_params = [self.fusion.parameters()] + + if args.mode == "online": + self.model = emformer_rnnt() + if args.mode == "offline": + self.model = conformer_rnnt() + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum") + + self.optimizer = torch.optim.AdamW( + itertools.chain(*([self.model.parameters()] + frontend_params + fusion_params)), + lr=8e-4, + weight_decay=0.06, + betas=(0.9, 0.98), + ) + + self.automatic_optimization = False + + def _step(self, batch, _, step_type): + if batch is None: + return None + + prepended_targets = batch.targets.new_empty([batch.targets.size(0), batch.targets.size(1) + 1]) + prepended_targets[:, 1:] = batch.targets + prepended_targets[:, 0] = self.blank_idx + prepended_target_lengths = batch.target_lengths + 1 + video_features = self.video_frontend(batch.videos) + audio_features = self.audio_frontend(batch.audios) + output, src_lengths, _, _ = self.model( + self.fusion(torch.cat([video_features, audio_features], dim=-1)), + batch.video_lengths, + prepended_targets, + prepended_target_lengths, + ) + loss = self.loss(output, batch.targets, src_lengths, batch.target_lengths) + self.log(f"Losses/{step_type}_loss", loss, on_step=True, on_epoch=True) + + return loss + + def configure_optimizers(self): + self.warmup_lr_scheduler = WarmupCosineScheduler( + self.optimizer, + 10, + self.args.epochs, + len(self.trainer.datamodule.train_dataloader()) / self.trainer.num_devices / self.trainer.num_nodes, + ) + self.lr_scheduler_interval = "step" + return ( + [self.optimizer], + [{"scheduler": self.warmup_lr_scheduler, "interval": self.lr_scheduler_interval}], + ) + + def forward(self, batch: AVBatch): + decoder = RNNTBeamSearch(self.model, self.blank_idx) + video_features = self.video_frontend(batch.videos.to(self.device)) + audio_features = self.audio_frontend(batch.audios.to(self.device)) + hypotheses = decoder( + self.fusion(torch.cat([video_features, audio_features], dim=-1)), + batch.video_lengths.to(self.device), + beam_width=20, + ) + return post_process_hypos(hypotheses, self.sp_model)[0][0] + + def training_step(self, batch: AVBatch, batch_idx): + """Custom training step. + + By default, DDP does the following on each train step: + - For each GPU, compute loss and gradient on shard of training data. + - Sync and average gradients across all GPUs. The final gradient + is (sum of gradients across all GPUs) / N, where N is the world + size (total number of GPUs). + - Update parameters on each GPU. + + Here, we do the following: + - For k-th GPU, compute loss and scale it by (N / B_total), where B_total is + the sum of batch sizes across all GPUs. Compute gradient from scaled loss. + - Sync and average gradients across all GPUs. The final gradient + is (sum of gradients across all GPUs) / B_total. + - Update parameters on each GPU. + + Doing so allows us to account for the variability in batch sizes that + variable-length sequential data commonly yields. 
+ """ + + opt = self.optimizers() + opt.zero_grad() + loss = self._step(batch, batch_idx, "train") + batch_size = batch.videos.size(0) + batch_sizes = self.all_gather(batch_size) + + loss *= batch_sizes.size(0) / batch_sizes.sum() # world size / batch size + self.manual_backward(loss) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 10) + opt.step() + + sch = self.lr_schedulers() + sch.step() + + self.log("monitoring_step", self.global_step) + + return loss + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx, "val") + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx, "test") diff --git a/examples/asr/avsr_rnnt/lrs3.py b/examples/asr/avsr_rnnt/lrs3.py new file mode 100644 index 0000000000..5077f67c70 --- /dev/null +++ b/examples/asr/avsr_rnnt/lrs3.py @@ -0,0 +1,76 @@ +import os + +import torchaudio +import torchvision +from torch.utils.data import Dataset + + +def _load_list(args, *filenames): + output = [] + length = [] + for filename in filenames: + filepath = os.path.join(args.root_dir, "labels", filename) + for line in open(filepath).read().splitlines(): + dataset, rel_path, input_length = line.split(",")[0], line.split(",")[1], line.split(",")[2] + path = os.path.normpath(os.path.join(args.root_dir, dataset, rel_path[:-4] + ".mp4")) + length.append(int(input_length)) + output.append(path) + return output, length + + +def load_video(path): + """ + rtype: torch, T x C x H x W + """ + vid = torchvision.io.read_video(path, pts_unit="sec", output_format="THWC")[0] + vid = vid.permute((0, 3, 1, 2)) + return vid + + +def load_audio(path): + """ + rtype: torch, T x 1 + """ + waveform, sample_rate = torchaudio.load(path, normalize=True) + return waveform.transpose(1, 0) + + +def load_transcript(path): + transcript_path = path.replace("video_seg", "text_seg")[:-4] + ".txt" + return open(transcript_path).read().splitlines()[0] + + +def load_item(path, md): + if md == "v": + return (load_video(path), load_transcript(path)) + if md == "a": + return (load_audio(path), load_transcript(path)) + if md == "av": + return (load_audio(path), load_video(path), load_transcript(path)) + + +class LRS3(Dataset): + def __init__( + self, + args, + subset: str = "train", + ) -> None: + + if subset is not None and subset not in ["train", "val", "test"]: + raise ValueError("When `subset` is not None, it must be one of ['train', 'val', 'test'].") + + self.args = args + + if subset == "train": + self._filelist, self._lengthlist = _load_list(self.args, "lrs3_train_transcript_lengths_seg16s.csv") + if subset == "val": + self._filelist, self._lengthlist = _load_list(self.args, "lrs3_test_transcript_lengths_seg16s.csv") + if subset == "test": + self._filelist, self._lengthlist = _load_list(self.args, "lrs3_test_transcript_lengths_seg16s.csv") + + def __getitem__(self, n): + path = self._filelist[n] + return load_item(path, self.args.md) + + def __len__(self) -> int: + return len(self._filelist) diff --git a/examples/asr/avsr_rnnt/models/conformer_rnnt.py b/examples/asr/avsr_rnnt/models/conformer_rnnt.py new file mode 100644 index 0000000000..a1fb14d9d9 --- /dev/null +++ b/examples/asr/avsr_rnnt/models/conformer_rnnt.py @@ -0,0 +1,25 @@ +from torchaudio.prototype.models import conformer_rnnt_model + +# https://pytorch.org/audio/master/_modules/torchaudio/prototype/models/rnnt.html#conformer_rnnt_model + + +def conformer_rnnt(): + return conformer_rnnt_model( + input_dim=512, + encoding_dim=1024, + time_reduction_stride=1, + conformer_input_dim=256, + 
conformer_ffn_dim=1024, + conformer_num_layers=16, + conformer_num_heads=4, + conformer_depthwise_conv_kernel_size=31, + conformer_dropout=0.1, + num_symbols=1024, + symbol_embedding_dim=256, + num_lstm_layers=2, + lstm_hidden_dim=512, + lstm_layer_norm=True, + lstm_layer_norm_epsilon=1e-5, + lstm_dropout=0.3, + joiner_activation="tanh", + ) diff --git a/examples/asr/avsr_rnnt/models/emformer_rnnt.py b/examples/asr/avsr_rnnt/models/emformer_rnnt.py new file mode 100644 index 0000000000..9cd4a4cdc1 --- /dev/null +++ b/examples/asr/avsr_rnnt/models/emformer_rnnt.py @@ -0,0 +1,28 @@ +from torchaudio.models.rnnt import emformer_rnnt_model + + +# https://pytorch.org/audio/master/_modules/torchaudio/models/rnnt.html#emformer_rnnt_base +def emformer_rnnt(): + return emformer_rnnt_model( + input_dim=512, + encoding_dim=1024, + num_symbols=1024, + segment_length=64, + right_context_length=0, + time_reduction_input_dim=128, + time_reduction_stride=1, + transformer_num_heads=4, + transformer_ffn_dim=2048, + transformer_num_layers=20, + transformer_dropout=0.1, + transformer_activation="gelu", + transformer_left_context_length=30, + transformer_max_memory_size=0, + transformer_weight_init_scale_strategy="depthwise", + transformer_tanh_on_mem=True, + symbol_embedding_dim=512, + num_lstm_layers=3, + lstm_layer_norm=True, + lstm_layer_norm_epsilon=1e-3, + lstm_dropout=0.3, + ) diff --git a/examples/asr/avsr_rnnt/models/fusion.py b/examples/asr/avsr_rnnt/models/fusion.py new file mode 100644 index 0000000000..8d2fd1f5e5 --- /dev/null +++ b/examples/asr/avsr_rnnt/models/fusion.py @@ -0,0 +1,36 @@ +import torch + + +class FeedForwardModule(torch.nn.Module): + r"""Positionwise feed forward layer. + + Args: + input_dim (int): input dimension. + hidden_dim (int): hidden dimension. + dropout (float, optional): dropout probability. (Default: 0.0) + """ + + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, dropout: float = 0.0) -> None: + super().__init__() + self.sequential = torch.nn.Sequential( + torch.nn.LayerNorm(input_dim), + torch.nn.Linear(input_dim, hidden_dim, bias=True), + torch.nn.SiLU(), + torch.nn.Dropout(dropout), + torch.nn.Linear(hidden_dim, output_dim, bias=True), + torch.nn.Dropout(dropout), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + r""" + Args: + input (torch.Tensor): with shape `(*, D)`. + + Returns: + torch.Tensor: output, with shape `(*, D)`. + """ + return self.sequential(input) + + +def fusion_module(): + return FeedForwardModule(1024, 3072, 512, 0.1) diff --git a/examples/asr/avsr_rnnt/models/resnet.py b/examples/asr/avsr_rnnt/models/resnet.py new file mode 100644 index 0000000000..826f8ed07f --- /dev/null +++ b/examples/asr/avsr_rnnt/models/resnet.py @@ -0,0 +1,237 @@ +import torch.nn as nn + + +def conv3x3(in_planes, out_planes, stride=1): + """conv3x3. + :param in_planes: int, number of channels in the input sequence. + :param out_planes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. + """ + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + + +def downsample_basic_block(inplanes, outplanes, stride): + """downsample_basic_block. + :param inplanes: int, number of channels in the input sequence. + :param outplanes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. 
+ """ + return nn.Sequential( + nn.Conv2d( + inplanes, + outplanes, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm2d(outplanes), + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + relu_type="swish", + ): + """__init__. + :param inplanes: int, number of channels in the input sequence. + :param planes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. + :param downsample: boolean, if True, the temporal resolution is downsampled. + :param relu_type: str, type of activation function. + """ + super(BasicBlock, self).__init__() + + assert relu_type in ["relu", "prelu", "swish"] + + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes) + + if relu_type == "relu": + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + elif relu_type == "prelu": + self.relu1 = nn.PReLU(num_parameters=planes) + self.relu2 = nn.PReLU(num_parameters=planes) + elif relu_type == "swish": + self.relu1 = nn.SiLU(inplace=True) + self.relu2 = nn.SiLU(inplace=True) + else: + raise NotImplementedError + # -------- + + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward. + :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). + """ + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu1(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu2(out) + + return out + + +class ResNet(nn.Module): + def __init__( + self, + block, + layers, + relu_type="swish", + ): + super(ResNet, self).__init__() + self.inplanes = 64 + self.relu_type = relu_type + self.downsample_block = downsample_basic_block + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AdaptiveAvgPool2d(1) + + def _make_layer(self, block, planes, blocks, stride=1): + """_make_layer. + :param block: torch.nn.Module, class of blocks. + :param planes: int, number of channels produced by the convolution. + :param blocks: int, number of layers in a block. + :param stride: int, size of the convolving kernel. + """ + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = self.downsample_block( + inplanes=self.inplanes, + outplanes=planes * block.expansion, + stride=stride, + ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + relu_type=self.relu_type, + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + relu_type=self.relu_type, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + """forward. + :param x: torch.Tensor, input tensor with input size (B, C, T, H, W). 
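+        Note: in this example the 2D trunk receives frames already flattened to
+        (B*T, C, H, W) by threeD_to_2D_tensor in Conv3dResNet.forward.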
+ """ + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.avgpool(x) + x = x.view(x.size(0), -1) + return x + + +# -- auxiliary functions +def threeD_to_2D_tensor(x): + n_batch, n_channels, s_time, sx, sy = x.shape + x = x.transpose(1, 2) + return x.reshape(n_batch * s_time, n_channels, sx, sy) + + +class Conv3dResNet(nn.Module): + """Conv3dResNet module""" + + def __init__(self, backbone_type="resnet", relu_type="swish"): + """__init__. + :param backbone_type: str, the type of a visual front-end. + :param relu_type: str, activation function used in an audio front-end. + """ + super(Conv3dResNet, self).__init__() + + self.backbone_type = backbone_type + + self.frontend_nout = 64 + self.trunk = ResNet( + BasicBlock, + [2, 2, 2, 2], + relu_type=relu_type, + ) + + # -- frontend3D + if relu_type == "relu": + frontend_relu = nn.ReLU(True) + elif relu_type == "prelu": + frontend_relu = nn.PReLU(self.frontend_nout) + elif relu_type == "swish": + frontend_relu = nn.SiLU(inplace=True) + + self.frontend3D = nn.Sequential( + nn.Conv3d( + in_channels=1, + out_channels=self.frontend_nout, + kernel_size=(5, 7, 7), + stride=(1, 2, 2), + padding=(2, 3, 3), + bias=False, + ), + nn.BatchNorm3d(self.frontend_nout), + frontend_relu, + nn.MaxPool3d( + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + ), + ) + + def forward(self, xs_pad): + """forward. + :param xs_pad: torch.Tensor, batch of padded input sequences. + """ + # -- include Channel dimension + xs_pad = xs_pad.transpose(2, 1) + B, C, T, H, W = xs_pad.size() + xs_pad = self.frontend3D(xs_pad) + Tnew = xs_pad.shape[2] # outpu should be B x C2 x Tnew x H x W + xs_pad = threeD_to_2D_tensor(xs_pad) + xs_pad = self.trunk(xs_pad) + xs_pad = xs_pad.view(B, Tnew, xs_pad.size(1)) + return xs_pad + + +def video_resnet(): + return Conv3dResNet() diff --git a/examples/asr/avsr_rnnt/models/resnet1d.py b/examples/asr/avsr_rnnt/models/resnet1d.py new file mode 100644 index 0000000000..f79f46bee5 --- /dev/null +++ b/examples/asr/avsr_rnnt/models/resnet1d.py @@ -0,0 +1,233 @@ +import torch.nn as nn + + +def conv3x3(in_planes, out_planes, stride=1): + """conv3x3. + :param in_planes: int, number of channels in the input sequence. + :param out_planes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. + """ + return nn.Conv1d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + + +def downsample_basic_block(inplanes, outplanes, stride): + """downsample_basic_block. + :param inplanes: int, number of channels in the input sequence. + :param outplanes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. + """ + return nn.Sequential( + nn.Conv1d( + inplanes, + outplanes, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm1d(outplanes), + ) + + +class BasicBlock1D(nn.Module): + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + relu_type="relu", + ): + """__init__. + :param inplanes: int, number of channels in the input sequence. + :param planes: int, number of channels produced by the convolution. + :param stride: int, size of the convolving kernel. + :param downsample: boolean, if True, the temporal resolution is downsampled. + :param relu_type: str, type of activation function. 
+ """ + super(BasicBlock1D, self).__init__() + + assert relu_type in ["relu", "prelu", "swish"] + + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm1d(planes) + + # type of ReLU is an input option + if relu_type == "relu": + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + elif relu_type == "prelu": + self.relu1 = nn.PReLU(num_parameters=planes) + self.relu2 = nn.PReLU(num_parameters=planes) + elif relu_type == "swish": + self.relu1 = nn.SiLU(inplace=True) + self.relu2 = nn.SiLU(inplace=True) + else: + raise NotImplementedError + # -------- + + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm1d(planes) + + self.downsample = downsample + self.stride = stride + + def forward(self, x): + """forward. + :param x: torch.Tensor, input tensor with input size (B, C, T) + """ + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu1(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu2(out) + + return out + + +class ResNet1D(nn.Module): + def __init__( + self, + block, + layers, + relu_type="swish", + a_upsample_ratio=1, + ): + """__init__. + :param block: torch.nn.Module, class of blocks. + :param layers: List, customised layers in each block. + :param relu_type: str, type of activation function. + :param a_upsample_ratio: int, The ratio related to the \ + temporal resolution of output features of the frontend. \ + a_upsample_ratio=1 produce features with a fps of 25. + """ + super(ResNet1D, self).__init__() + self.inplanes = 64 + self.relu_type = relu_type + self.downsample_block = downsample_basic_block + self.a_upsample_ratio = a_upsample_ratio + + self.conv1 = nn.Conv1d( + in_channels=1, + out_channels=self.inplanes, + kernel_size=80, + stride=4, + padding=38, + bias=False, + ) + self.bn1 = nn.BatchNorm1d(self.inplanes) + + if relu_type == "relu": + self.relu = nn.ReLU(inplace=True) + elif relu_type == "prelu": + self.relu = nn.PReLU(num_parameters=self.inplanes) + elif relu_type == "swish": + self.relu = nn.SiLU(inplace=True) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool1d( + kernel_size=20 // self.a_upsample_ratio, + stride=20 // self.a_upsample_ratio, + ) + + def _make_layer(self, block, planes, blocks, stride=1): + """_make_layer. + :param block: torch.nn.Module, class of blocks. + :param planes: int, number of channels produced by the convolution. + :param blocks: int, number of layers in a block. + :param stride: int, size of the convolving kernel. + """ + + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = self.downsample_block( + inplanes=self.inplanes, + outplanes=planes * block.expansion, + stride=stride, + ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + relu_type=self.relu_type, + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + relu_type=self.relu_type, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + """forward. 
+ :param x: torch.Tensor, input tensor with input size (B, C, T) + """ + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.avgpool(x) + return x + + +class Conv1dResNet(nn.Module): + """Conv1dResNet""" + + def __init__(self, relu_type="swish", a_upsample_ratio=1): + """__init__. + :param relu_type: str, Activation function used in an audio front-end. + :param a_upsample_ratio: int, The ratio related to the \ + temporal resolution of output features of the frontend. \ + a_upsample_ratio=1 produce features with a fps of 25. + """ + + super(Conv1dResNet, self).__init__() + self.a_upsample_ratio = a_upsample_ratio + self.trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type=relu_type, a_upsample_ratio=a_upsample_ratio) + + def forward(self, xs_pad): + """forward. + :param xs_pad: torch.Tensor, batch of padded input sequences (B, Tmax, idim) + """ + B, T, C = xs_pad.size() + xs_pad = xs_pad[:, : T // 640 * 640, :] + xs_pad = xs_pad.transpose(1, 2) + xs_pad = self.trunk(xs_pad) + # -- from B x C x T to B x T x C + xs_pad = xs_pad.transpose(1, 2) + return xs_pad + + +def audio_resnet(): + return Conv1dResNet() diff --git a/examples/asr/avsr_rnnt/schedulers.py b/examples/asr/avsr_rnnt/schedulers.py new file mode 100644 index 0000000000..b50329ec91 --- /dev/null +++ b/examples/asr/avsr_rnnt/schedulers.py @@ -0,0 +1,28 @@ +import math + +import torch + + +class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer: torch.optim.Optimizer, + warmup_epochs: int, + total_epochs: int, + steps_per_epoch: int, + last_epoch=-1, + verbose=False, + ): + self.warmup_steps = warmup_epochs * steps_per_epoch + self.total_steps = total_epochs * steps_per_epoch + super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) + + def get_lr(self): + if self._step_count < self.warmup_steps: + return [self._step_count / self.warmup_steps * base_lr for base_lr in self.base_lrs] + else: + decay_steps = self.total_steps - self.warmup_steps + return [ + 0.5 * base_lr * (1 + math.cos(math.pi * (self._step_count - self.warmup_steps) / decay_steps)) + for base_lr in self.base_lrs + ] diff --git a/examples/asr/avsr_rnnt/train.py b/examples/asr/avsr_rnnt/train.py new file mode 100644 index 0000000000..e609c43611 --- /dev/null +++ b/examples/asr/avsr_rnnt/train.py @@ -0,0 +1,140 @@ +import logging +import os +from argparse import ArgumentParser + +import sentencepiece as spm +from average_checkpoints import ensemble +from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint +from pytorch_lightning.strategies import DDPStrategy +from transforms import get_data_module + + +def get_trainer(args): + seed_everything(1) + + checkpoint = ModelCheckpoint( + dirpath=os.path.join(args.exp_dir, args.experiment_name) if args.exp_dir else None, + monitor="monitoring_step", + mode="max", + save_last=True, + filename="{epoch}", + save_top_k=10, + ) + lr_monitor = LearningRateMonitor(logging_interval="step") + callbacks = [ + checkpoint, + lr_monitor, + ] + return Trainer( + sync_batchnorm=True, + default_root_dir=args.exp_dir, + max_epochs=args.epochs, + num_nodes=args.num_nodes, + devices=args.gpus, + accelerator="gpu", + strategy=DDPStrategy(find_unused_parameters=False), + callbacks=callbacks, + reload_dataloaders_every_n_epochs=1, + resume_from_checkpoint=args.resume_from_checkpoint, + ) + + +def 
get_lightning_module(args): + sp_model = spm.SentencePieceProcessor(model_file=str(args.sp_model_path)) + if args.md == "av": + from lightning_av import AVConformerRNNTModule + + model = AVConformerRNNTModule(args, sp_model) + else: + from lightning import ConformerRNNTModule + + model = ConformerRNNTModule(args, sp_model) + return model + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + "--md", + type=str, + help="Modality", + required=True, + ) + parser.add_argument( + "--mode", + type=str, + help="Perform online or offline recognition.", + required=True, + ) + parser.add_argument( + "--root-dir", + type=str, + help="Root directory to LRS3 audio-visual datasets.", + required=True, + ) + parser.add_argument( + "--sp-model-path", + type=str, + help="Path to SentencePiece model.", + required=True, + ) + parser.add_argument( + "--pretrained-model-path", + type=str, + help="Path to Pretraned model.", + ) + parser.add_argument( + "--exp-dir", + type=str, + help="Directory to save checkpoints and logs to. (Default: './exp')", + ) + parser.add_argument( + "--experiment-name", + type=str, + help="Experiment name", + ) + parser.add_argument( + "--num-nodes", + default=8, + type=int, + help="Number of nodes to use for training. (Default: 8)", + ) + parser.add_argument( + "--gpus", + default=8, + type=int, + help="Number of GPUs per node to use for training. (Default: 8)", + ) + parser.add_argument( + "--epochs", + default=55, + type=int, + help="Number of epochs to train for. (Default: 55)", + ) + parser.add_argument( + "--resume-from-checkpoint", default=None, type=str, help="Path to the checkpoint to resume from" + ) + parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging") + return parser.parse_args() + + +def init_logger(debug): + fmt = "%(asctime)s %(message)s" if debug else "%(message)s" + level = logging.DEBUG if debug else logging.INFO + logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S") + + +def cli_main(): + args = parse_args() + init_logger(args.debug) + model = get_lightning_module(args) + data_module = get_data_module(args, str(args.sp_model_path)) + trainer = get_trainer(args) + trainer.fit(model, data_module) + + ensemble(args) + + +if __name__ == "__main__": + cli_main() diff --git a/examples/asr/avsr_rnnt/train_spm.py b/examples/asr/avsr_rnnt/train_spm.py new file mode 100644 index 0000000000..f8dd1332be --- /dev/null +++ b/examples/asr/avsr_rnnt/train_spm.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +"""Trains a SentencePiece model on transcripts across LRS3 pretrain and trainval. + +- `[lrs3_path]` is the directory path for the LRS3 cropped face dataset. 
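+- Transcripts are read from `[lrs3_path]/LRS3_text_seg16s/{pretrain,trainval}/*/*.txt` (see `run_cli` below).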
+ +Example: +python train_spm.py --lrs3-path [lrs3_path] +""" + +import io +import pathlib +from argparse import ArgumentParser, RawTextHelpFormatter + +import sentencepiece as spm + + +def get_transcript_text(transcript_path): + return [open(transcript_path).read().splitlines()[0].lower()] + + +def get_transcripts(dataset_path): + transcript_paths = dataset_path.glob("*/*.txt") + merged_transcripts = [] + for path in transcript_paths: + merged_transcripts += get_transcript_text(path) + return merged_transcripts + + +def train_spm(input): + model_writer = io.BytesIO() + spm.SentencePieceTrainer.train( + sentence_iterator=iter(input), + model_writer=model_writer, + vocab_size=1023, + model_type="unigram", + input_sentence_size=-1, + character_coverage=1.0, + bos_id=0, + pad_id=1, + eos_id=2, + unk_id=3, + ) + return model_writer.getvalue() + + +def parse_args(): + default_output_path = "./spm_unigram_1023.model" + parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) + parser.add_argument( + "--lrs3-path", + type=pathlib.Path, + help="Path to LRS3 datasets.", + required=True, + ) + parser.add_argument( + "--output-file", + default=pathlib.Path(default_output_path), + type=pathlib.Path, + help=f"File to save model to. (Default: '{default_output_path}')", + ) + return parser.parse_args() + + +def run_cli(): + args = parse_args() + + root = args.lrs3_path / "LRS3_text_seg16s" + splits = ["pretrain", "trainval"] + merged_transcripts = [] + for split in splits: + path = pathlib.Path(root) / split + merged_transcripts += get_transcripts(path) + model = train_spm(merged_transcripts) + + with open(args.output_file, "wb") as f: + f.write(model) + + +if __name__ == "__main__": + run_cli() diff --git a/examples/asr/avsr_rnnt/transforms.py b/examples/asr/avsr_rnnt/transforms.py new file mode 100644 index 0000000000..d17c8307ab --- /dev/null +++ b/examples/asr/avsr_rnnt/transforms.py @@ -0,0 +1,173 @@ +import random +from typing import List + +import sentencepiece as spm +import torch +import torchvision +from data_module import LRS3DataModule +from lightning import Batch +from lightning_av import AVBatch + + +class FunctionalModule(torch.nn.Module): + def __init__(self, functional): + super().__init__() + self.functional = functional + + def forward(self, input): + return self.functional(input) + + +class AdaptiveTimeMask(torch.nn.Module): + def __init__(self, window, stride): + super().__init__() + self.window = window + self.stride = stride + + def forward(self, x): + cloned = x.clone() + length = cloned.size(1) + n_mask = int((length + self.stride - 0.1) // self.stride) + ts = torch.randint(0, self.window, size=(n_mask, 2)) + for t, t_end in ts: + if length - t <= 0: + continue + t_start = random.randrange(0, length - t) + if t_start == t_start + t: + continue + t_end += t_start + cloned[:, t_start:t_end] = 0 + return cloned + + +def _extract_labels(sp_model, samples: List): + targets = [sp_model.encode(sample[-1].lower()) for sample in samples] + lengths = torch.tensor([len(elem) for elem in targets]).to(dtype=torch.int32) + targets = torch.nn.utils.rnn.pad_sequence( + [torch.tensor(elem) for elem in targets], + batch_first=True, + padding_value=1.0, + ).to(dtype=torch.int32) + return targets, lengths + + +def _extract_features(video_pipeline, audio_pipeline, samples, args): + raw_videos = [] + raw_audios = [] + for sample in samples: + if args.md == "v": + raw_videos.append(sample[0]) + if args.md == "a": + raw_audios.append(sample[0]) + if args.md == "av": + length = 
min(len(sample[0]) // 640, len(sample[1])) + raw_audios.append(sample[0][: length * 640]) + raw_videos.append(sample[1][:length]) + + if args.md == "v" or args.md == "av": + videos = torch.nn.utils.rnn.pad_sequence(raw_videos, batch_first=True) + videos = video_pipeline(videos) + video_lengths = torch.tensor([elem.shape[0] for elem in videos], dtype=torch.int32) + if args.md == "a" or args.md == "av": + audios = torch.nn.utils.rnn.pad_sequence(raw_audios, batch_first=True) + audios = audio_pipeline(audios) + audio_lengths = torch.tensor([elem.shape[0] // 640 for elem in audios], dtype=torch.int32) + if args.md == "v": + return videos, video_lengths + if args.md == "a": + return audios, audio_lengths + if args.md == "av": + return audios, videos, audio_lengths, video_lengths + + +class TrainTransform: + def __init__(self, sp_model_path: str, args): + self.args = args + self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path) + self.train_video_pipeline = torch.nn.Sequential( + FunctionalModule(lambda x: x / 255.0), + torchvision.transforms.RandomCrop(88), + torchvision.transforms.RandomHorizontalFlip(0.5), + FunctionalModule(lambda x: x.transpose(0, 1)), + torchvision.transforms.Grayscale(), + FunctionalModule(lambda x: x.transpose(0, 1)), + AdaptiveTimeMask(10, 25), + torchvision.transforms.Normalize(0.421, 0.165), + ) + self.train_audio_pipeline = torch.nn.Sequential( + AdaptiveTimeMask(10, 25), + ) + + def __call__(self, samples: List): + targets, target_lengths = _extract_labels(self.sp_model, samples) + if self.args.md == "a": + audios, audio_lengths = _extract_features( + self.train_video_pipeline, self.train_audio_pipeline, samples, self.args + ) + return Batch(audios, audio_lengths, targets, target_lengths) + if self.args.md == "v": + videos, video_lengths = _extract_features( + self.train_video_pipeline, self.train_audio_pipeline, samples, self.args + ) + return Batch(videos, video_lengths, targets, target_lengths) + if self.args.md == "av": + audios, videos, audio_lengths, video_lengths = _extract_features( + self.train_video_pipeline, self.train_audio_pipeline, samples, self.args + ) + return AVBatch(audios, videos, audio_lengths, video_lengths, targets, target_lengths) + + +class ValTransform: + def __init__(self, sp_model_path: str, args): + self.args = args + self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path) + self.valid_video_pipeline = torch.nn.Sequential( + FunctionalModule(lambda x: x / 255.0), + torchvision.transforms.CenterCrop(88), + FunctionalModule(lambda x: x.transpose(0, 1)), + torchvision.transforms.Grayscale(), + FunctionalModule(lambda x: x.transpose(0, 1)), + torchvision.transforms.Normalize(0.421, 0.165), + ) + self.valid_audio_pipeline = torch.nn.Sequential( + FunctionalModule(lambda x: x), + ) + + def __call__(self, samples: List): + targets, target_lengths = _extract_labels(self.sp_model, samples) + if self.args.md == "a": + audios, audio_lengths = _extract_features( + self.valid_video_pipeline, self.valid_audio_pipeline, samples, self.args + ) + return Batch(audios, audio_lengths, targets, target_lengths) + if self.args.md == "v": + videos, video_lengths = _extract_features( + self.valid_video_pipeline, self.valid_audio_pipeline, samples, self.args + ) + return Batch(videos, video_lengths, targets, target_lengths) + if self.args.md == "av": + audios, videos, audio_lengths, video_lengths = _extract_features( + self.valid_video_pipeline, self.valid_audio_pipeline, samples, self.args + ) + return AVBatch(audios, videos, 
audio_lengths, video_lengths, targets, target_lengths) + + +class TestTransform: + def __init__(self, sp_model_path: str, args): + self.val_transforms = ValTransform(sp_model_path, args) + + def __call__(self, sample): + return self.val_transforms([sample]), [sample] + + +def get_data_module(args, sp_model_path, max_frames=1800): + train_transform = TrainTransform(sp_model_path=sp_model_path, args=args) + val_transform = ValTransform(sp_model_path=sp_model_path, args=args) + test_transform = TestTransform(sp_model_path=sp_model_path, args=args) + return LRS3DataModule( + args=args, + train_transform=train_transform, + val_transform=val_transform, + test_transform=test_transform, + max_frames=max_frames, + ) diff --git a/examples/asr/emformer_rnnt/pipeline_demo.py b/examples/asr/emformer_rnnt/pipeline_demo.py index 782bc1d539..eacba2503a 100644 --- a/examples/asr/emformer_rnnt/pipeline_demo.py +++ b/examples/asr/emformer_rnnt/pipeline_demo.py @@ -65,9 +65,9 @@ def run_eval_streaming(args): with torch.no_grad(): features, length = streaming_feature_extractor(segment) hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis) - hypothesis = hypos[0] - transcript = token_processor(hypothesis[0], lstrip=False) - print(transcript, end="", flush=True) + hypothesis = hypos + transcript = token_processor(hypos[0][0], lstrip=True) + print(transcript, end="\r", flush=True) print() # Non-streaming decode. diff --git a/examples/hubert/finetune.py b/examples/hubert/finetune.py index 24008e2f2d..9e5d5be066 100644 --- a/examples/hubert/finetune.py +++ b/examples/hubert/finetune.py @@ -12,11 +12,10 @@ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter from typing import Tuple -from lightning import HuBERTFineTuneModule +from lightning.pytorch import seed_everything, Trainer +from lightning.pytorch.callbacks import ModelCheckpoint -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.utilities.seed import seed_everything +from lightning_modules import HuBERTFineTuneModule logger = logging.getLogger(__name__) @@ -56,10 +55,10 @@ def run_train(args): default_root_dir=args.exp_dir, max_steps=args.max_updates, num_nodes=args.num_nodes, - gpus=args.gpus, + devices=args.gpus, accelerator="gpu", - strategy="ddp", - replace_sampler_ddp=False, + strategy="ddp_find_unused_parameters_true", + use_distributed_sampler=False, callbacks=callbacks, reload_dataloaders_every_n_epochs=1, val_check_interval=500, diff --git a/examples/hubert/lightning.py b/examples/hubert/lightning_modules.py similarity index 99% rename from examples/hubert/lightning.py rename to examples/hubert/lightning_modules.py index f42163a5d5..67cb487b78 100644 --- a/examples/hubert/lightning.py +++ b/examples/hubert/lightning_modules.py @@ -14,8 +14,8 @@ DistributedBatchSampler, HuBERTDataSet, ) +from lightning.pytorch import LightningModule from loss import hubert_loss -from pytorch_lightning import LightningModule from torch import Tensor from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader diff --git a/examples/hubert/train.py b/examples/hubert/train.py index b60a7fd11d..02f04e1320 100644 --- a/examples/hubert/train.py +++ b/examples/hubert/train.py @@ -9,10 +9,10 @@ from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter from typing import Tuple -from lightning import HuBERTPreTrainModule -from pytorch_lightning import Trainer -from 
pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.utilities.seed import seed_everything +from lightning.pytorch import seed_everything, Trainer +from lightning.pytorch.callbacks import ModelCheckpoint + +from lightning_modules import HuBERTPreTrainModule logger = logging.getLogger(__name__) @@ -52,10 +52,10 @@ def run_train(args): default_root_dir=args.exp_dir, max_steps=args.max_updates, num_nodes=args.num_nodes, - gpus=args.gpus, + devices=args.gpus, accelerator="gpu", - strategy="ddp", - replace_sampler_ddp=False, + strategy="ddp_find_unused_parameters_true", + use_distributed_sampler=False, callbacks=callbacks, reload_dataloaders_every_n_epochs=1, ) diff --git a/examples/pipeline_tacotron2/text/text_preprocessing.py b/examples/pipeline_tacotron2/text/text_preprocessing.py index 04829d6b1b..db7f92baa7 100644 --- a/examples/pipeline_tacotron2/text/text_preprocessing.py +++ b/examples/pipeline_tacotron2/text/text_preprocessing.py @@ -70,8 +70,8 @@ _phonemizer = None -available_symbol_set = set(["english_characters", "english_phonemes"]) -available_phonemizers = set(["DeepPhonemizer"]) +available_symbol_set = {"english_characters", "english_phonemes"} +available_phonemizers = {"DeepPhonemizer"} def get_symbol_list(symbol_list: str = "english_characters", cmudict_root: Optional[str] = "./") -> List[str]: diff --git a/examples/self_supervised_learning/data_modules/_utils.py b/examples/self_supervised_learning/data_modules/_utils.py index d862e3f496..97fbf9b1ae 100644 --- a/examples/self_supervised_learning/data_modules/_utils.py +++ b/examples/self_supervised_learning/data_modules/_utils.py @@ -9,7 +9,7 @@ from torch import Tensor from torch.utils.data import BatchSampler, Dataset, DistributedSampler -from ..lightning import Batch +from ..lightning_modules import Batch class BucketizeBatchSampler(BatchSampler): diff --git a/examples/self_supervised_learning/lightning.py b/examples/self_supervised_learning/lightning_modules.py similarity index 98% rename from examples/self_supervised_learning/lightning.py rename to examples/self_supervised_learning/lightning_modules.py index 866f7f2a7d..ca56f656da 100644 --- a/examples/self_supervised_learning/lightning.py +++ b/examples/self_supervised_learning/lightning_modules.py @@ -1,7 +1,7 @@ from collections import namedtuple from typing import Callable, Optional -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.nn as nn from torch.optim.optimizer import Optimizer diff --git a/examples/self_supervised_learning/train_hubert.py b/examples/self_supervised_learning/train_hubert.py index a7899b7e04..839a0eb1dd 100644 --- a/examples/self_supervised_learning/train_hubert.py +++ b/examples/self_supervised_learning/train_hubert.py @@ -6,12 +6,11 @@ import torch import torchaudio.models -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.utilities.seed import seed_everything +from lightning.pytorch import seed_everything, Trainer +from lightning.pytorch.callbacks import ModelCheckpoint from .data_modules import HuBERTDataModule -from .lightning import SSLPretrainModule +from .lightning_modules import SSLPretrainModule from .losses import hubert_loss from .lr_schedulers import LinearDecayLRScheduler @@ -102,11 +101,11 @@ def run_train(args): num_nodes=args.num_nodes, devices=args.gpus, accelerator="gpu", - strategy="ddp", + strategy="ddp_find_unused_parameters_true", precision=args.precision, 
accumulate_grad_batches=args.accumulate_grad_batches, gradient_clip_val=args.clip_norm, - replace_sampler_ddp=False, + use_distributed_sampler=False, callbacks=callbacks, reload_dataloaders_every_n_epochs=1, ) diff --git a/examples/tutorials/additive_synthesis_tutorial.py b/examples/tutorials/additive_synthesis_tutorial.py index 8cbad6de56..d6407f95bc 100644 --- a/examples/tutorials/additive_synthesis_tutorial.py +++ b/examples/tutorials/additive_synthesis_tutorial.py @@ -35,17 +35,14 @@ # try: - from torchaudio.prototype.functional import ( - oscillator_bank, - extend_pitch, - adsr_envelope, - ) + from torchaudio.prototype.functional import adsr_envelope, extend_pitch, oscillator_bank except ModuleNotFoundError: print( "Failed to import prototype DSP features. " "Please install torchaudio nightly builds. " "Please refer to https://pytorch.org/get-started/locally " - "for instructions to install a nightly build.") + "for instructions to install a nightly build." + ) raise import matplotlib.pyplot as plt @@ -78,7 +75,7 @@ PI = torch.pi PI2 = 2 * torch.pi -F0 = 344. # fundamental frequency +F0 = 344.0 # fundamental frequency DURATION = 1.1 # [seconds] SAMPLE_RATE = 16_000 # [Hz] @@ -87,26 +84,19 @@ ###################################################################### # + def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.1): t = torch.arange(waveform.size(0)) / sample_rate fig, axes = plt.subplots(4, 1, sharex=True) axes[0].plot(t, freq) - axes[0].set( - title=f"Oscillator bank (bank size: {amp.size(-1)})", - ylabel="Frequency [Hz]", - ylim=[-0.03, None]) + axes[0].set(title=f"Oscillator bank (bank size: {amp.size(-1)})", ylabel="Frequency [Hz]", ylim=[-0.03, None]) axes[1].plot(t, amp) - axes[1].set( - ylabel="Amplitude", - ylim=[-0.03 if torch.all(amp >= 0.0) else None, None]) + axes[1].set(ylabel="Amplitude", ylim=[-0.03 if torch.all(amp >= 0.0) else None, None]) axes[2].plot(t, waveform) axes[2].set(ylabel="Waveform") axes[3].specgram(waveform, Fs=sample_rate) - axes[3].set( - ylabel="Spectrogram", - xlabel="Time [s]", - xlim=[-0.01, t[-1] + 0.01]) + axes[3].set(ylabel="Spectrogram", xlabel="Time [s]", xlim=[-0.01, t[-1] + 0.01]) for i in range(4): axes[i].grid(True) @@ -121,6 +111,7 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.1): waveform /= waveform.abs().max() return Audio(vol * waveform, rate=sample_rate, normalize=False) + ###################################################################### # Harmonic Overtones # ------------------- @@ -159,10 +150,11 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.1): # and adds extend pitch in accordance with the formula above. 
# + def sawtooth_wave(freq0, amp0, num_pitches, sample_rate): freq = extend_pitch(freq0, num_pitches) - mults = [-((-1) ** i) / (PI * i) for i in range(1, 1+num_pitches)] + mults = [-((-1) ** i) / (PI * i) for i in range(1, 1 + num_pitches)] amp = extend_pitch(amp0, mults) waveform = oscillator_bank(freq, amp, sample_rate=sample_rate) return freq, amp, waveform @@ -176,7 +168,7 @@ def sawtooth_wave(freq0, amp0, num_pitches, sample_rate): freq0 = torch.full((NUM_FRAMES, 1), F0) amp0 = torch.ones((NUM_FRAMES, 1)) freq, amp, waveform = sawtooth_wave(freq0, amp0, int(SAMPLE_RATE / F0), SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### # @@ -191,7 +183,7 @@ def sawtooth_wave(freq0, amp0, num_pitches, sample_rate): freq0 = F0 + f_dev * torch.sin(phase).unsqueeze(-1) freq, amp, waveform = sawtooth_wave(freq0, amp0, int(SAMPLE_RATE / F0), SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### # Square wave @@ -212,22 +204,23 @@ def sawtooth_wave(freq0, amp0, num_pitches, sample_rate): def square_wave(freq0, amp0, num_pitches, sample_rate): - mults = [2. * i + 1. for i in range(num_pitches)] + mults = [2.0 * i + 1.0 for i in range(num_pitches)] freq = extend_pitch(freq0, mults) - mults = [4 / (PI * (2. * i + 1.)) for i in range(num_pitches)] + mults = [4 / (PI * (2.0 * i + 1.0)) for i in range(num_pitches)] amp = extend_pitch(amp0, mults) waveform = oscillator_bank(freq, amp, sample_rate=sample_rate) return freq, amp, waveform + ###################################################################### # freq0 = torch.full((NUM_FRAMES, 1), F0) amp0 = torch.ones((NUM_FRAMES, 1)) -freq, amp, waveform = square_wave(freq0, amp0, int(SAMPLE_RATE/F0/2), SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +freq, amp, waveform = square_wave(freq0, amp0, int(SAMPLE_RATE / F0 / 2), SAMPLE_RATE) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### # Triangle wave @@ -248,11 +241,11 @@ def square_wave(freq0, amp0, num_pitches, sample_rate): def triangle_wave(freq0, amp0, num_pitches, sample_rate): - mults = [2. * i + 1. for i in range(num_pitches)] + mults = [2.0 * i + 1.0 for i in range(num_pitches)] freq = extend_pitch(freq0, mults) - c = 8 / (PI ** 2) - mults = [c * ((-1) ** i) / ((2. * i + 1.) 
** 2) for i in range(num_pitches)] + c = 8 / (PI**2) + mults = [c * ((-1) ** i) / ((2.0 * i + 1.0) ** 2) for i in range(num_pitches)] amp = extend_pitch(amp0, mults) waveform = oscillator_bank(freq, amp, sample_rate=sample_rate) @@ -263,7 +256,7 @@ def triangle_wave(freq0, amp0, num_pitches, sample_rate): # freq, amp, waveform = triangle_wave(freq0, amp0, int(SAMPLE_RATE / F0 / 2), SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### # Inharmonic Paritials @@ -288,18 +281,18 @@ def triangle_wave(freq0, amp0, num_pitches, sample_rate): num_frames = int(SAMPLE_RATE * duration) freq0 = torch.full((num_frames, 1), F0) -mults = [0.56, 0.92, 1.19, 1.71, 2, 2.74, 3., 3.76, 4.07] +mults = [0.56, 0.92, 1.19, 1.71, 2, 2.74, 3.0, 3.76, 4.07] freq = extend_pitch(freq0, mults) amp = adsr_envelope( num_frames=num_frames, attack=0.002, decay=0.998, - sustain=0., - release=0., + sustain=0.0, + release=0.0, n_decay=2, ) -amp = torch.stack([amp * (0.5 ** i) for i in range(num_tones)], dim=-1) +amp = torch.stack([amp * (0.5**i) for i in range(num_tones)], dim=-1) waveform = oscillator_bank(freq, amp, sample_rate=SAMPLE_RATE) diff --git a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py index 1b454b612c..154f8589f7 100644 --- a/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py +++ b/examples/tutorials/asr_inference_with_ctc_decoder_tutorial.py @@ -207,6 +207,7 @@ class CustomLM(CTCDecoderLM): """Create a Python wrapper around `language_model` to feed to the decoder.""" + def __init__(self, language_model: torch.nn.Module): CTCDecoderLM.__init__(self) self.language_model = language_model diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py index 2675feefe4..cbe53b5326 100644 --- a/examples/tutorials/audio_data_augmentation_tutorial.py +++ b/examples/tutorials/audio_data_augmentation_tutorial.py @@ -20,6 +20,8 @@ print(torch.__version__) print(torchaudio.__version__) +import matplotlib.pyplot as plt + ###################################################################### # Preparation # ----------- @@ -27,10 +29,7 @@ # First, we import the modules and download the audio assets we use in this tutorial. # -import math - from IPython.display import Audio -import matplotlib.pyplot as plt from torchaudio.utils import download_asset @@ -44,56 +43,38 @@ # Applying effects and filtering # ------------------------------ # -# :py:func:`torchaudio.sox_effects` allows for directly applying filters similar to -# those available in ``sox`` to Tensor objects and file object audio sources. -# -# There are two functions for this: -# -# - :py:func:`torchaudio.sox_effects.apply_effects_tensor` for applying effects -# to Tensor. -# - :py:func:`torchaudio.sox_effects.apply_effects_file` for applying effects to -# other audio sources. +# :py:class:`torchaudio.io.AudioEffector` allows for directly applying +# filters and codecs to Tensor objects, in a similar way as ``ffmpeg`` +# command # -# Both functions accept effect definitions in the form -# ``List[List[str]]``. -# This is mostly consistent with how ``sox`` command works, but one caveat is -# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s -# implementation does not. 
-# -# For the list of available effects, please refer to `the sox -# documentation `__. -# -# **Tip** If you need to load and resample your audio data on the fly, -# then you can use :py:func:`torchaudio.sox_effects.apply_effects_file` -# with effect ``"rate"``. -# -# **Note** :py:func:`torchaudio.sox_effects.apply_effects_file` accepts a -# file-like object or path-like object. -# Similar to :py:func:`torchaudio.load`, when the audio format cannot be -# inferred from either the file extension or header, you can provide -# argument ``format`` to specify the format of the audio source. -# -# **Note** This process is not differentiable. +# `AudioEffector Usages <./effector_tutorial.html>` explains how to use +# this class, so for the detail, please refer to the tutorial. # # Load the data -waveform1, sample_rate1 = torchaudio.load(SAMPLE_WAV) +waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False) # Define effects -effects = [ - ["lowpass", "-1", "300"], # apply single-pole lowpass filter - ["speed", "0.8"], # reduce the speed - # This only changes sample rate, so it is necessary to - # add `rate` effect with original sample rate after this. - ["rate", f"{sample_rate1}"], - ["reverb", "-w"], # Reverbration gives some dramatic feeling -] +effect = ",".join( + [ + "lowpass=frequency=300:poles=1", # apply single-pole lowpass filter + "atempo=0.8", # reduce the speed + "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3" + # Applying echo gives some dramatic feeling + ], +) + # Apply effects -waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor(waveform1, sample_rate1, effects) +def apply_effect(waveform, sample_rate, effect): + effector = torchaudio.io.AudioEffector(effect=effect) + return effector.apply(waveform, sample_rate) + -print(waveform1.shape, sample_rate1) -print(waveform2.shape, sample_rate2) +waveform2 = apply_effect(waveform1, sample_rate, effect) + +print(waveform1.shape, sample_rate) +print(waveform2.shape, sample_rate) ###################################################################### # Note that the number of frames and number of channels are different from @@ -101,6 +82,7 @@ # audio. 
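# A minimal arithmetic sketch of why the frame counts differ, considering only
# the "atempo=0.8" part of the chain above (the echo adds a further tail on top
# of this): slowing the tempo to 0.8x stretches the duration by a factor of
# 1 / 0.8. The sample rate and clip length below are hypothetical.

sample_rate = 8000                       # hypothetical value for illustration
num_input_frames = 3 * sample_rate       # a hypothetical 3-second clip
approx_output_frames = int(num_input_frames / 0.8)
print(num_input_frames, "->", approx_output_frames)  # 24000 -> 30000 frames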
# + def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): waveform = waveform.numpy() @@ -120,9 +102,11 @@ def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None): figure.suptitle(title) plt.show(block=False) + ###################################################################### # + def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): waveform = waveform.numpy() @@ -140,27 +124,25 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): figure.suptitle(title) plt.show(block=False) + ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # -plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-0.1, 3.2)) -plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) -Audio(waveform1, rate=sample_rate1) +plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2)) +plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04)) +Audio(waveform1.T, rate=sample_rate) ###################################################################### -# Effects applied: -# ~~~~~~~~~~~~~~~~ +# Effects applied +# ~~~~~~~~~~~~~~~ # -plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-0.1, 3.2)) -plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) -Audio(waveform2, rate=sample_rate2) +plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2)) +plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04)) +Audio(waveform2.T, rate=sample_rate) -###################################################################### -# Doesn’t it sound more dramatic? -# ###################################################################### # Simulating room reverberation @@ -203,8 +185,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): augmented = F.fftconvolve(speech, rir) ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # plot_waveform(speech, sample_rate, title="Original") @@ -212,8 +194,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(speech, rate=sample_rate) ###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ +# RIR applied +# ~~~~~~~~~~~ # plot_waveform(augmented, sample_rate, title="RIR Applied") @@ -248,8 +230,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): ###################################################################### -# Background noise: -# ~~~~~~~~~~~~~~~~~ +# Background noise +# ~~~~~~~~~~~~~~~~ # plot_waveform(noise, sample_rate, title="Background noise") @@ -257,8 +239,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noise, rate=sample_rate) ###################################################################### -# SNR 20 dB: -# ~~~~~~~~~~ +# SNR 20 dB +# ~~~~~~~~~ # snr_db, noisy_speech = snr_dbs[0], noisy_speeches[0:1] @@ -267,8 +249,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noisy_speech, rate=sample_rate) ###################################################################### -# SNR 10 dB: -# ~~~~~~~~~~ +# SNR 10 dB +# ~~~~~~~~~ # snr_db, noisy_speech = snr_dbs[1], noisy_speeches[1:2] @@ -277,8 +259,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): Audio(noisy_speech, rate=sample_rate) 
###################################################################### -# SNR 3 dB: -# ~~~~~~~~~ +# SNR 3 dB +# ~~~~~~~~ # snr_db, noisy_speech = snr_dbs[2], noisy_speeches[2:3] @@ -291,60 +273,56 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): # Applying codec to Tensor object # ------------------------------- # -# :py:func:`torchaudio.functional.apply_codec` can apply codecs to +# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to # a Tensor object. # -# **Note** This process is not differentiable. -# + +waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False) -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH) +def apply_codec(waveform, sample_rate, format, encoder=None): + encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder) + return encoder.apply(waveform, sample_rate) -configs = [ - {"format": "wav", "encoding": "ULAW", "bits_per_sample": 8}, - {"format": "gsm"}, - {"format": "vorbis", "compression": -1}, -] -waveforms = [] -for param in configs: - augmented = F.apply_codec(waveform, sample_rate, **param) - waveforms.append(augmented) ###################################################################### -# Original: -# ~~~~~~~~~ +# Original +# ~~~~~~~~ # -plot_waveform(waveform, sample_rate, title="Original") -plot_specgram(waveform, sample_rate, title="Original") -Audio(waveform, rate=sample_rate) +plot_waveform(waveform.T, sample_rate, title="Original") +plot_specgram(waveform.T, sample_rate, title="Original") +Audio(waveform.T, rate=sample_rate) ###################################################################### -# 8 bit mu-law: -# ~~~~~~~~~~~~~ +# 8 bit mu-law +# ~~~~~~~~~~~~ # -plot_waveform(waveforms[0], sample_rate, title="8 bit mu-law") -plot_specgram(waveforms[0], sample_rate, title="8 bit mu-law") -Audio(waveforms[0], rate=sample_rate) +mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw") +plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law") +plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law") +Audio(mulaw.T, rate=sample_rate) ###################################################################### -# GSM-FR: -# ~~~~~~~ +# G.722 +# ~~~~~ # -plot_waveform(waveforms[1], sample_rate, title="GSM-FR") -plot_specgram(waveforms[1], sample_rate, title="GSM-FR") -Audio(waveforms[1], rate=sample_rate) +g722 = apply_codec(waveform, sample_rate, "g722") +plot_waveform(g722.T, sample_rate, title="G.722") +plot_specgram(g722.T, sample_rate, title="G.722") +Audio(g722.T, rate=sample_rate) ###################################################################### -# Vorbis: -# ~~~~~~~ +# Vorbis +# ~~~~~~ # -plot_waveform(waveforms[2], sample_rate, title="Vorbis") -plot_specgram(waveforms[2], sample_rate, title="Vorbis") -Audio(waveforms[2], rate=sample_rate) +vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis") +plot_waveform(vorbis.T, sample_rate, title="Vorbis") +plot_specgram(vorbis.T, sample_rate, title="Vorbis") +Audio(vorbis.T, rate=sample_rate) ###################################################################### # Simulating a phone recoding @@ -378,62 +356,54 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): plot_specgram(bg_added, sample_rate, title="BG noise added") # Apply filtering and change sample rate -filtered, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( - bg_added, - sample_rate, - effects=[ - ["lowpass", "4000"], - [ - "compand", - "0.02,0.05", - "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", - "-8", - "-7", - 
"0.05", - ], - ["rate", "8000"], - ], +effect = ",".join( + [ + "lowpass=frequency=4000:poles=1", + "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05", + ] ) -plot_specgram(filtered, sample_rate2, title="Filtered") +filtered = apply_effect(bg_added.T, sample_rate, effect) +sample_rate2 = 8000 -# Apply telephony codec -codec_applied = F.apply_codec(filtered, sample_rate2, format="gsm") +plot_specgram(filtered.T, sample_rate2, title="Filtered") -plot_specgram(codec_applied, sample_rate2, title="GSM Codec Applied") +# Apply telephony codec +codec_applied = apply_codec(filtered, sample_rate2, "g722") +plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied") ###################################################################### -# Original speech: -# ~~~~~~~~~~~~~~~~ +# Original speech +# ~~~~~~~~~~~~~~~ # Audio(original_speech, rate=sample_rate) ###################################################################### -# RIR applied: -# ~~~~~~~~~~~~ +# RIR applied +# ~~~~~~~~~~~ # Audio(rir_applied, rate=sample_rate) ###################################################################### -# Background noise added: -# ~~~~~~~~~~~~~~~~~~~~~~~ +# Background noise added +# ~~~~~~~~~~~~~~~~~~~~~~ # Audio(bg_added, rate=sample_rate) ###################################################################### -# Filtered: -# ~~~~~~~~~ +# Filtered +# ~~~~~~~~ # -Audio(filtered, rate=sample_rate2) +Audio(filtered.T, rate=sample_rate2) ###################################################################### -# Codec applied: -# ~~~~~~~~~~~~~~ +# Codec applied +# ~~~~~~~~~~~~~ # -Audio(codec_applied, rate=sample_rate2) +Audio(codec_applied.T, rate=sample_rate2) diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py index f71d424ade..63b71bc14a 100644 --- a/examples/tutorials/audio_feature_extractions_tutorial.py +++ b/examples/tutorials/audio_feature_extractions_tutorial.py @@ -25,6 +25,23 @@ print(torch.__version__) print(torchaudio.__version__) +import librosa +import matplotlib.pyplot as plt + +###################################################################### +# Overview of audio features +# -------------------------- +# +# The following diagram shows the relationship between common audio features +# and torchaudio APIs to generate them. +# +# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png +# +# For the complete list of available features, please refer to the +# documentation. 
+# + + ###################################################################### # Preparation # ----------- @@ -38,8 +55,7 @@ # !pip install librosa # from IPython.display import Audio -import librosa -import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle from torchaudio.utils import download_asset torch.random.manual_seed(0) @@ -47,26 +63,28 @@ SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -def plot_waveform(waveform, sr, title="Waveform"): +def plot_waveform(waveform, sr, title="Waveform", ax=None): waveform = waveform.numpy() num_channels, num_frames = waveform.shape time_axis = torch.arange(0, num_frames) / sr - figure, axes = plt.subplots(num_channels, 1) - axes.plot(time_axis, waveform[0], linewidth=1) - axes.grid(True) - figure.suptitle(title) + if ax is None: + _, ax = plt.subplots(num_channels, 1) + ax.plot(time_axis, waveform[0], linewidth=1) + ax.grid(True) + ax.set_xlim([0, time_axis[-1]]) + ax.set_title(title) plt.show(block=False) -def plot_spectrogram(specgram, title=None, ylabel="freq_bin"): - fig, axs = plt.subplots(1, 1) - axs.set_title(title or "Spectrogram (db)") - axs.set_ylabel(ylabel) - axs.set_xlabel("frame") - im = axs.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto") - fig.colorbar(im, ax=axs) +def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None): + if ax is None: + _, ax = plt.subplots(1, 1) + if title is not None: + ax.set_title(title) + ax.set_ylabel(ylabel) + ax.imshow(librosa.power_to_db(specgram), origin="lower", aspect="auto", interpolation="nearest") plt.show(block=False) @@ -79,20 +97,6 @@ def plot_fbank(fbank, title=None): plt.show(block=False) -###################################################################### -# Overview of audio features -# -------------------------- -# -# The following diagram shows the relationship between common audio features -# and torchaudio APIs to generate them. -# -# .. image:: https://download.pytorch.org/torchaudio/tutorial-assets/torchaudio_feature_extractions.png -# -# For the complete list of available features, please refer to the -# documentation. -# - - ###################################################################### # Spectrogram # ----------- @@ -101,77 +105,157 @@ def plot_fbank(fbank, title=None): # you can use :py:func:`torchaudio.transforms.Spectrogram`. 
# +# Load audio SPEECH_WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_SPEECH) -plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform") -Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) +# Define transform +spectrogram = T.Spectrogram(n_fft=512) +# Perform transform +spec = spectrogram(SPEECH_WAVEFORM) ###################################################################### # -n_fft = 1024 -win_length = None -hop_length = 512 +fig, axs = plt.subplots(2, 1) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original waveform", ax=axs[0]) +plot_spectrogram(spec[0], title="spectrogram", ax=axs[1]) +fig.tight_layout() -# Define transform -spectrogram = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, - center=True, - pad_mode="reflect", - power=2.0, -) +###################################################################### +# + +Audio(SPEECH_WAVEFORM.numpy(), rate=SAMPLE_RATE) ###################################################################### +# The effect of ``n_fft`` parameter +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The core of spectrogram computation is (short-term) Fourier transform, +# and the ``n_fft`` parameter corresponds to the :math:`N` in the following +# definition of descrete Fourier transform. +# +# $$ X_k = \\sum_{n=0}^{N-1} x_n e^{-\\frac{2\\pi i}{N} nk} $$ +# +# (For the detail of Fourier transform, please refer to +# `Wikipedia `__. +# +# The value of ``n_fft`` determines the resolution of frequency axis. +# However, with the higher ``n_fft`` value, the energy will be distributed +# among more bins, so when you visualize it, it might look more blurry, +# even thought they are higher resolution. +# +# The following illustrates this; # -# Perform transform -spec = spectrogram(SPEECH_WAVEFORM) +###################################################################### +# +# .. note:: +# +# ``hop_length`` determines the time axis resolution. +# By default, (i.e. ``hop_length=None`` and ``win_length=None``), +# the value of ``n_fft // 4`` is used. +# Here we use the same ``hop_length`` value across different ``n_fft`` +# so that they have the same number of elemets in the time axis. +# + +n_ffts = [32, 128, 512, 2048] +hop_length = 64 + +specs = [] +for n_fft in n_ffts: + spectrogram = T.Spectrogram(n_fft=n_fft, hop_length=hop_length) + spec = spectrogram(SPEECH_WAVEFORM) + specs.append(spec) ###################################################################### # -plot_spectrogram(spec[0], title="torchaudio") +fig, axs = plt.subplots(len(specs), 1, sharex=True) +for i, (spec, n_fft) in enumerate(zip(specs, n_ffts)): + plot_spectrogram(spec[0], ylabel=f"n_fft={n_fft}", ax=axs[i]) + axs[i].set_xlabel(None) +fig.tight_layout() ###################################################################### -# GriffinLim -# ---------- # -# To recover a waveform from a spectrogram, you can use ``GriffinLim``. +# When comparing signals, it is desirable to use the same sampling rate, +# however if you must use the different sampling rate, care must be +# taken for interpretating the meaning of ``n_fft``. +# Recall that ``n_fft`` determines the resolution of the frequency +# axis for a given sampling rate. In other words, what each bin on +# the frequency axis represents is subject to the sampling rate. # +# As we have seen above, changing the value of ``n_fft`` does not change +# the coverage of frequency range for the same input signal. 
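# A small worked example of the resolution trade-off described above: for a
# given sampling rate, a spectrogram has n_fft // 2 + 1 frequency bins spaced
# sample_rate / n_fft Hz apart, so a larger n_fft yields finer bins that still
# cover the same 0 to sample_rate / 2 range. The sampling rate here is an
# assumed illustrative value.

sample_rate = 16000
for n_fft in (32, 128, 512, 2048):
    num_bins = n_fft // 2 + 1
    bin_width_hz = sample_rate / n_fft
    print(f"n_fft={n_fft:4d}: {num_bins:5d} bins, {bin_width_hz:8.2f} Hz per bin")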
-torch.random.manual_seed(0) +###################################################################### +# +# Let's downsample the audio and apply spectrogram with the same ``n_fft`` +# value. -n_fft = 1024 -win_length = None -hop_length = 512 +# Downsample to half of the original sample rate +speech2 = torchaudio.functional.resample(SPEECH_WAVEFORM, SAMPLE_RATE, SAMPLE_RATE // 2) +# Upsample to the original sample rate +speech3 = torchaudio.functional.resample(speech2, SAMPLE_RATE // 2, SAMPLE_RATE) -spec = T.Spectrogram( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -)(SPEECH_WAVEFORM) +###################################################################### +# + +# Apply the same spectrogram +spectrogram = T.Spectrogram(n_fft=512) + +spec0 = spectrogram(SPEECH_WAVEFORM) +spec2 = spectrogram(speech2) +spec3 = spectrogram(speech3) ###################################################################### # -griffin_lim = T.GriffinLim( - n_fft=n_fft, - win_length=win_length, - hop_length=hop_length, -) +# Visualize it +fig, axs = plt.subplots(3, 1) +plot_spectrogram(spec0[0], ylabel="Original", ax=axs[0]) +axs[0].add_patch(Rectangle((0, 3), 212, 128, edgecolor="r", facecolor="none")) +plot_spectrogram(spec2[0], ylabel="Downsampled", ax=axs[1]) +plot_spectrogram(spec3[0], ylabel="Upsampled", ax=axs[2]) +fig.tight_layout() ###################################################################### # +# In the above visualization, the second plot ("Downsampled") might +# give the impression that the spectrogram is streched. +# This is because the meaning of frequency bins is different from +# the original one. +# Even though, they have the same number of bins, in the second plot, +# the frequency is only covered to the half of the original sampling +# rate. +# This becomes more clear if we resample the downsampled signal again +# so that it has the same sample rate as the original. + +###################################################################### +# GriffinLim +# ---------- +# +# To recover a waveform from a spectrogram, you can use +# :py:class:`torchaudio.transforms.GriffinLim`. +# +# The same set of parameters used for spectrogram must be used. 
+ +# Define transforms +n_fft = 1024 +spectrogram = T.Spectrogram(n_fft=n_fft) +griffin_lim = T.GriffinLim(n_fft=n_fft) +# Apply the transforms +spec = spectrogram(SPEECH_WAVEFORM) reconstructed_waveform = griffin_lim(spec) ###################################################################### # -plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed") +_, axes = plt.subplots(2, 1, sharex=True, sharey=True) +plot_waveform(SPEECH_WAVEFORM, SAMPLE_RATE, title="Original", ax=axes[0]) +plot_waveform(reconstructed_waveform, SAMPLE_RATE, title="Reconstructed", ax=axes[1]) Audio(reconstructed_waveform, rate=SAMPLE_RATE) ###################################################################### @@ -253,7 +337,6 @@ def plot_fbank(fbank, title=None): pad_mode="reflect", power=2.0, norm="slaney", - onesided=True, n_mels=n_mels, mel_scale="htk", ) @@ -322,7 +405,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc[0]) +plot_spectrogram(mfcc[0], title="MFCC") ###################################################################### # Comparison against librosa @@ -350,7 +433,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # -plot_spectrogram(mfcc_librosa) +plot_spectrogram(mfcc_librosa, title="MFCC (librosa)") mse = torch.square(mfcc - mfcc_librosa).mean().item() print("Mean Square Difference: ", mse) @@ -376,7 +459,7 @@ def plot_fbank(fbank, title=None): ) lfcc = lfcc_transform(SPEECH_WAVEFORM) -plot_spectrogram(lfcc[0]) +plot_spectrogram(lfcc[0], title="LFCC") ###################################################################### # Pitch @@ -388,6 +471,7 @@ def plot_fbank(fbank, title=None): ###################################################################### # + def plot_pitch(waveform, sr, pitch): figure, axis = plt.subplots(1, 1) axis.set_title("Pitch Feature") @@ -406,54 +490,3 @@ def plot_pitch(waveform, sr, pitch): plot_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch) - -###################################################################### -# Kaldi Pitch (beta) -# ------------------ -# -# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic -# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``, -# and it is available as :py:func:`torchaudio.functional.compute_kaldi_pitch`. -# -# 1. A pitch extraction algorithm tuned for automatic speech recognition -# -# Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. -# Khudanpur -# -# 2014 IEEE International Conference on Acoustics, Speech and Signal -# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi: -# 10.1109/ICASSP.2014.6854049. 
-# [`abstract `__], -# [`paper `__] -# - -pitch_feature = F.compute_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE) -pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] - -###################################################################### -# - -def plot_kaldi_pitch(waveform, sr, pitch, nfcc): - _, axis = plt.subplots(1, 1) - axis.set_title("Kaldi Pitch Feature") - axis.grid(True) - - end_time = waveform.shape[1] / sr - time_axis = torch.linspace(0, end_time, waveform.shape[1]) - axis.plot(time_axis, waveform[0], linewidth=1, color="gray", alpha=0.3) - - time_axis = torch.linspace(0, end_time, pitch.shape[1]) - ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label="Pitch", color="green") - axis.set_ylim((-1.3, 1.3)) - - axis2 = axis.twinx() - time_axis = torch.linspace(0, end_time, nfcc.shape[1]) - ln2 = axis2.plot(time_axis, nfcc[0], linewidth=2, label="NFCC", color="blue", linestyle="--") - - lns = ln1 + ln2 - labels = [l.get_label() for l in lns] - axis.legend(lns, labels, loc=0) - plt.show(block=False) - - -plot_kaldi_pitch(SPEECH_WAVEFORM, SAMPLE_RATE, pitch, nfcc) diff --git a/examples/tutorials/audio_io_tutorial.py b/examples/tutorials/audio_io_tutorial.py index 514f985e48..6fd0f1f2e9 100644 --- a/examples/tutorials/audio_io_tutorial.py +++ b/examples/tutorials/audio_io_tutorial.py @@ -61,6 +61,7 @@ def __init__(self, obj): def read(self, n): return self.obj.read(n) + return _wrapper(obj) @@ -294,7 +295,8 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): print("Fetching until the requested frames are available...") with requests.get(url, stream=True) as response: waveform2, sample_rate2 = torchaudio.load( - _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames) + _hide_seek(response.raw), frame_offset=frame_offset, num_frames=num_frames + ) print(f" - Fetched {response.raw.tell()} bytes") print("Checking the resulting waveform ... ", end="") @@ -333,6 +335,7 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram"): ###################################################################### # + def inspect_file(path): print("-" * 10) print("Source:", path) @@ -341,6 +344,7 @@ def inspect_file(path): print(f" - {torchaudio.info(path)}") print() + ###################################################################### # # Save without any encoding option. 
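# A minimal sketch of how the inspect_file helper above might be used after
# saving: write a tensor with torchaudio.save, then report its size and
# metadata. The file name and waveform below are hypothetical.

import torch
import torchaudio

sample_rate = 16000
waveform = torch.rand(1, sample_rate) * 2 - 1   # hypothetical 1-second signal in [-1, 1]

torchaudio.save("example_default.wav", waveform, sample_rate)
inspect_file("example_default.wav")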
diff --git a/examples/tutorials/audio_resampling_tutorial.py b/examples/tutorials/audio_resampling_tutorial.py index ae50cbfc73..33b1ffec53 100644 --- a/examples/tutorials/audio_resampling_tutorial.py +++ b/examples/tutorials/audio_resampling_tutorial.py @@ -27,14 +27,14 @@ import timeit import librosa -import resampy -import matplotlib.pyplot as plt import matplotlib.colors as mcolors +import matplotlib.pyplot as plt import pandas as pd -from IPython.display import Audio, display +import resampy +from IPython.display import Audio -pd.set_option('display.max_rows', None) -pd.set_option('display.max_columns', None) +pd.set_option("display.max_rows", None) +pd.set_option("display.max_columns", None) DEFAULT_OFFSET = 201 @@ -338,6 +338,7 @@ def plot_sweep( ###################################################################### # + def benchmark_resample_functional( waveform, sample_rate, @@ -348,8 +349,9 @@ def benchmark_resample_functional( beta=None, iters=5, ): - return timeit.timeit( - stmt=''' + return ( + timeit.timeit( + stmt=""" torchaudio.functional.resample( waveform, sample_rate, @@ -359,16 +361,20 @@ def benchmark_resample_functional( resampling_method=resampling_method, beta=beta, ) - ''', - setup='import torchaudio', - number=iters, - globals=locals(), - ) * 1000 / iters + """, + setup="import torchaudio", + number=iters, + globals=locals(), + ) + * 1000 + / iters + ) ###################################################################### # + def benchmark_resample_transforms( waveform, sample_rate, @@ -379,9 +385,10 @@ def benchmark_resample_transforms( beta=None, iters=5, ): - return timeit.timeit( - stmt='resampler(waveform)', - setup=''' + return ( + timeit.timeit( + stmt="resampler(waveform)", + setup=""" import torchaudio resampler = torchaudio.transforms.Resample( @@ -394,15 +401,19 @@ def benchmark_resample_transforms( beta=beta, ) resampler.to(waveform.device) - ''', - number=iters, - globals=locals(), - ) * 1000 / iters + """, + number=iters, + globals=locals(), + ) + * 1000 + / iters + ) ###################################################################### # + def benchmark_resample_librosa( waveform, sample_rate, @@ -411,24 +422,29 @@ def benchmark_resample_librosa( iters=5, ): waveform_np = waveform.squeeze().numpy() - return timeit.timeit( - stmt=''' + return ( + timeit.timeit( + stmt=""" librosa.resample( waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=res_type, ) - ''', - setup='import librosa', - number=iters, - globals=locals(), - ) * 1000 / iters + """, + setup="import librosa", + number=iters, + globals=locals(), + ) + * 1000 + / iters + ) ###################################################################### # + def benchmark(sample_rate, resample_rate): times, rows = [], [] waveform = get_sine_sweep(sample_rate).to(torch.float32) @@ -483,7 +499,7 @@ def plot(df): print(df.round(2)) ax = df.plot(kind="bar") plt.ylabel("Time Elapsed [ms]") - plt.xticks(rotation = 0, fontsize=10) + plt.xticks(rotation=0, fontsize=10) for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS): label = ["N/A" if v != v else str(v) for v in df[col].round(2)] ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small") diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py index 6c970e7654..a0d3d7acb7 100644 --- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py +++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py @@ -9,8 +9,7 @@ 
``torchaudio``'s CTC forced alignment API proposed in the paper `“Scaling Speech Technology to 1,000+ Languages” `__, -and two advanced usages, i.e. dealing with non-English data and -transcription errors. +and one advanced usage, i.e. dealing with transcription errors with a token. Though there’s some overlap in visualization diagrams, the scope here is different from the `“Forced Alignment with @@ -39,17 +38,13 @@ "Failed to import the forced alignment API. " "Please install torchaudio nightly builds. " "Please refer to https://pytorch.org/get-started/locally " - "for instructions to install a nightly build.") + "for instructions to install a nightly build." + ) raise -import matplotlib -import matplotlib.pyplot as plt -from IPython.display import Audio - - ###################################################################### -# I. Basic usages -# --------------- +# Basic usages +# ------------ # # In this section, we cover the following content: # @@ -71,7 +66,10 @@ # %matplotlib inline from dataclasses import dataclass + import IPython +import matplotlib +import matplotlib.pyplot as plt matplotlib.rcParams["figure.figsize"] = [16.0, 4.8] @@ -98,7 +96,7 @@ emissions, _ = model(waveform.to(device)) emissions = torch.log_softmax(emissions, dim=-1) -emission = emissions[0].cpu().detach() +emission = emissions.cpu().detach() dictionary = {c: i for i, c in enumerate(labels)} print(dictionary) @@ -109,7 +107,7 @@ # ^^^^^^^^^^^^^ # -plt.imshow(emission.T) +plt.imshow(emission[0].T) plt.colorbar() plt.title("Frame-wise class probabilities") plt.xlabel("Time") @@ -193,8 +191,6 @@ # token-level and word-level alignments easily. # -import torchaudio.functional as F - @dataclass class Frame: @@ -209,27 +205,27 @@ def compute_alignments(transcript, dictionary, emission): frames = [] tokens = [dictionary[c] for c in transcript.replace(" ", "")] - targets = torch.tensor(tokens, dtype=torch.int32) - input_lengths = torch.tensor(emission.shape[0]) - target_lengths = torch.tensor(targets.shape[0]) + targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0) + input_lengths = torch.tensor([emission.shape[1]]) + target_lengths = torch.tensor([targets.shape[1]]) # This is the key step, where we call the forced alignment API functional.forced_align to compute alignments. - frame_alignment, frame_scores = F.forced_align(emission, targets, input_lengths, target_lengths, 0) + frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0) - assert len(frame_alignment) == input_lengths.item() - assert len(targets) == target_lengths.item() + assert frame_alignment.shape[1] == input_lengths[0].item() + assert targets.shape[1] == target_lengths[0].item() token_index = -1 prev_hyp = 0 - for i in range(len(frame_alignment)): - if frame_alignment[i].item() == 0: + for i in range(frame_alignment.shape[1]): + if frame_alignment[0][i].item() == 0: prev_hyp = 0 continue - if frame_alignment[i].item() != prev_hyp: + if frame_alignment[0][i].item() != prev_hyp: token_index += 1 - frames.append(Frame(token_index, i, frame_scores[i].exp().item())) - prev_hyp = frame_alignment[i].item() + frames.append(Frame(token_index, i, frame_scores[0][i].exp().item())) + prev_hyp = frame_alignment[0][i].item() return frames, frame_alignment, frame_scores @@ -255,6 +251,7 @@ def compute_alignments(transcript, dictionary, emission): # frame-level confidence scores. 
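# A toy, self-contained illustration of the merging idea that the next block
# implements: consecutive frames assigned the same token collapse into one
# segment, and the segment score is the average of its frame scores. The
# alignment and scores below are made up for illustration.

toy_alignment = ["i", "i", "-", "h", "h", "h", "a", "d"]   # "-" stands for the blank token
toy_scores = [0.9, 0.8, 0.0, 0.7, 0.9, 0.8, 0.95, 0.6]

toy_segments = []
i = 0
while i < len(toy_alignment):
    j = i
    while j < len(toy_alignment) and toy_alignment[j] == toy_alignment[i]:
        j += 1
    if toy_alignment[i] != "-":
        avg_score = sum(toy_scores[i:j]) / (j - i)
        toy_segments.append((toy_alignment[i], i, j, round(avg_score, 2)))
    i = j
print(toy_segments)  # [('i', 0, 2, 0.85), ('h', 3, 6, 0.8), ('a', 6, 7, 0.95), ('d', 7, 8, 0.6)]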
# + # Merge the labels @dataclass class Segment: @@ -279,8 +276,6 @@ def merge_repeats(frames, transcript): while i2 < len(frames) and frames[i1].token_index == frames[i2].token_index: i2 += 1 score = sum(frames[k].score for k in range(i1, i2)) / (i2 - i1) - tokens = [dictionary[c] if c in dictionary else dictionary['@'] for c in transcript.replace(" ", "")] - segments.append( Segment( transcript_nospace[frames[i1].token_index], @@ -370,7 +365,7 @@ def merge_words(transcript, segments, separator=" "): s = len(words) else: s = 0 - segs = segments[i1 + s:i2 + s] + segs = segments[i1 + s : i2 + s] word = "".join([seg.label for seg in segs]) score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs) words.append(Segment(word, segments[i1 + s].start, segments[i2 + s - 1].end, score)) @@ -380,6 +375,7 @@ def merge_words(transcript, segments, separator=" "): i3 += 1 return words + word_segments = merge_words(transcript, segments, "|") @@ -388,12 +384,13 @@ def merge_words(transcript, segments, separator=" "): # ^^^^^^^^^^^^^ # + def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): fig, ax2 = plt.subplots(figsize=(64, 12)) - plt.rcParams.update({'font.size': 30}) + plt.rcParams.update({"font.size": 30}) # The original waveform - ratio = waveform.size(0) / input_lengths + ratio = waveform.size(1) / input_lengths ax2.plot(waveform) ax2.set_ylim(-1.0 * scale, 1.0 * scale) ax2.set_xlim(0, waveform.size(-1)) @@ -413,11 +410,12 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): ax2.set_xlabel("time [second]", fontsize=40) ax2.set_yticks([]) + plot_alignments( segments, word_segments, - waveform[0], - emission.shape[0], + waveform, + emission.shape[1], 1, ) plt.show() @@ -425,11 +423,12 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): ###################################################################### + # A trick to embed the resulting audio to the generated file. # `IPython.display.Audio` has to be the last call in a cell, # and there should be only one call par cell. def display_segment(i, waveform, word_segments, frame_alignment): - ratio = waveform.size(1) / len(frame_alignment) + ratio = waveform.size(1) / frame_alignment.size(1) word = word_segments[i] x0 = int(ratio * word.start) x1 = int(ratio * word.end) @@ -437,6 +436,7 @@ def display_segment(i, waveform, word_segments, frame_alignment): segment = waveform[:, x0:x1] return IPython.display.Audio(segment.numpy(), rate=sample_rate) + # Generate the audio for each segment print(transcript) IPython.display.Audio(SPEECH_FILE) @@ -488,178 +488,46 @@ def display_segment(i, waveform, word_segments, frame_alignment): ###################################################################### -# II. Advancd usages -# ------------------ -# -# Aligning non-English data -# ~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Here we show an example of computing forced alignments on a German -# utterance using the multilingual Wav2vec2 model described in the paper -# `“Scaling Speech Technology to 1,000+ -# Languages” `__. -# The model was trained on 23K of audio data from 1100+ languages using -# the `“uroman vocabulary” `__ as -# targets. 
-# - -from torchaudio.models import wav2vec2_model - -model = wav2vec2_model( - extractor_mode="layer_norm", - extractor_conv_layer_config=[ - (512, 10, 5), - (512, 3, 2), - (512, 3, 2), - (512, 3, 2), - (512, 3, 2), - (512, 2, 2), - (512, 2, 2), - ], - extractor_conv_bias=True, - encoder_embed_dim=1024, - encoder_projection_dropout=0.0, - encoder_pos_conv_kernel=128, - encoder_pos_conv_groups=16, - encoder_num_layers=24, - encoder_num_heads=16, - encoder_attention_dropout=0.0, - encoder_ff_interm_features=4096, - encoder_ff_interm_dropout=0.1, - encoder_dropout=0.0, - encoder_layer_norm_first=True, - encoder_layer_drop=0.1, - aux_num_out=31, -) - -torch.hub.download_url_to_file("https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt", "model.pt") -checkpoint = torch.load("model.pt", map_location="cpu") - -model.load_state_dict(checkpoint) -model.eval() - - -waveform, _ = torchaudio.load(SPEECH_FILE) - - -def get_emission(waveform): - # NOTE: this step is essential - waveform = torch.nn.functional.layer_norm(waveform, waveform.shape) +# Advanced usage: Dealing with missing transcripts using the token +# --------------------------------------------------------------------------- +# +# Now let’s look at when the transcript is partially missing, how can we +# improve alignment quality using the token, which is capable of modeling +# any token. +# +# Here we use the same English example as used above. But we remove the +# beginning text “i had that curiosity beside me at” from the transcript. +# Aligning audio with such transcript results in wrong alignments of the +# existing word “this”. However, this issue can be mitigated by using the +# token to model the missing text. +# - emissions, _ = model(waveform) +# Reload the emission tensor in order to add the extra dimension corresponding to the token. +with torch.inference_mode(): + waveform, _ = torchaudio.load(SPEECH_FILE) + emissions, _ = model(waveform.to(device)) emissions = torch.log_softmax(emissions, dim=-1) - emission = emissions[0].cpu().detach() # Append the extra dimension corresponding to the token extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1) - emissions = torch.cat((emissions, extra_dim), 2) - emission = emissions[0].cpu().detach() - return emission, waveform - -emission, waveform = get_emission(waveform) - -# Construct the dictionary -# '@' represents the OOV token, '*' represents the token. -# and are fairseq's legacy tokens, which're not used. -dictionary = { - "": 0, - "": 1, - "": 2, - "@": 3, - "a": 4, - "i": 5, - "e": 6, - "n": 7, - "o": 8, - "u": 9, - "t": 10, - "s": 11, - "r": 12, - "m": 13, - "k": 14, - "l": 15, - "d": 16, - "g": 17, - "h": 18, - "y": 19, - "b": 20, - "p": 21, - "w": 22, - "c": 23, - "v": 24, - "j": 25, - "z": 26, - "f": 27, - "'": 28, - "q": 29, - "x": 30, - "*": 31, -} -assert len(dictionary) == emission.shape[1] + emissions = torch.cat((emissions.cpu(), extra_dim), 2) + emission = emissions.detach() + +# Extend the dictionary to include the token. 
+dictionary["*"] = 29 + +assert len(dictionary) == emission.shape[2] def compute_and_plot_alignments(transcript, dictionary, emission, waveform): frames, frame_alignment, _ = compute_alignments(transcript, dictionary, emission) segments = merge_repeats(frames, transcript) - word_segments = merge_words(transcript, segments) - plot_alignments( - segments, - word_segments, - waveform[0], - emission.shape[0] - ) + word_segments = merge_words(transcript, segments, "|") + plot_alignments(segments, word_segments, waveform, emission.shape[1], 1) plt.show() return word_segments, frame_alignment -# One can follow the following steps to download the uroman romanizer and use it to obtain normalized transcripts. -# def normalize_uroman(text): -# text = text.lower() -# text = text.replace("’", "'") -# text = re.sub("([^a-z' ])", " ", text) -# text = re.sub(' +', ' ', text) -# return text.strip() -# -# echo 'aber seit ich bei ihnen das brot hole brauch ich viel weniger schulze wandte sich ab die kinder taten ihm leid' > test.txt" -# git clone https://github.com/isi-nlp/uroman -# uroman/bin/uroman.pl < test.txt > test_romanized.txt -# -# file = "test_romanized.txt" -# f = open(file, "r") -# lines = f.readlines() -# text_normalized = normalize_uroman(lines[0].strip()) - - -text_normalized = "aber seit ich bei ihnen das brot hole brauch ich viel weniger schulze wandte sich ab die kinder taten ihm leid" -SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/10349_8674_000087.flac") -waveform, _ = torchaudio.load(SPEECH_FILE) - -emission, waveform = get_emission(waveform) - -transcript = text_normalized -word_segments, frame_alignment = compute_and_plot_alignments(transcript, dictionary, emission, waveform) - -###################################################################### -# Dealing with missing transcripts using the token -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now let’s look at when the transcript is partially missing, how can we -# improve alignment quality using the token, which is capable of modeling -# any token. Note that in the above section, we have manually added the -# token to the vocabualry and the emission matrix. -# -# Here we use the same English example as used above. But we remove the -# beginning text “i had that curiosity beside me at” from the transcript. -# Aligning audio with such transcript results in wrong alignments of the -# existing word “this”. Using the OOV token “@” to model the missing text -# doesn’t help (still resulting in wrong alignments for “this”). However, -# this issue can be mitigated by using a token to model the missing text. 
-# - -SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -waveform, _ = torchaudio.load(SPEECH_FILE) -emission, waveform = get_emission(waveform) -transcript = "i had that curiosity beside me at this moment" # original: word_segments, frame_alignment = compute_and_plot_alignments(transcript, dictionary, emission, waveform) @@ -667,19 +535,13 @@ def compute_and_plot_alignments(transcript, dictionary, emission, waveform): # Demonstrate the effect of token for dealing with deletion errors # ("i had that curiosity beside me at" missing from the transcript): -transcript = "this moment" -word_segments, frame_alignment = compute_and_plot_alignments(transcript, dictionary, emission, waveform) - -###################################################################### - -# Replacing the missing transcript with the OOV token "@": -transcript = "@ this moment" +transcript = "THIS|MOMENT" word_segments, frame_alignment = compute_and_plot_alignments(transcript, dictionary, emission, waveform) ###################################################################### # Replacing the missing transcript with the token: -transcript = "* this moment" +transcript = "*|THIS|MOMENT" word_segments, frame_alignment = compute_and_plot_alignments(transcript, dictionary, emission, waveform) @@ -688,9 +550,8 @@ def compute_and_plot_alignments(transcript, dictionary, emission, waveform): # ---------- # # In this tutorial, we looked at how to use torchaudio’s forced alignment -# API and Wav2Vec2 pre-trained acoustic model to align and segment audio -# files, and demonstrated two advanced usages: 1) Inference on non-English data -# 2) How introducing a token could improve alignment accuracy when +# API to align and segment speech files, and demonstrated one advanced usage: +# How introducing a token could improve alignment accuracy when # transcription errors exist. # diff --git a/examples/tutorials/device_asr.py b/examples/tutorials/device_asr.py index 8c780ac7b9..b25ad4ae7b 100644 --- a/examples/tutorials/device_asr.py +++ b/examples/tutorials/device_asr.py @@ -10,11 +10,11 @@ .. note:: - This tutorial requires FFmpeg libraries (>=5.0, <6) and SentencePiece. + This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece. There are multiple ways to install FFmpeg libraries. If you are using Anaconda Python distribution, - ``conda install -c conda-forge 'ffmpeg<6'`` will install + ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries. You can install SentencePiece by running ``pip install sentencepiece``. diff --git a/examples/tutorials/effector_tutorial.py b/examples/tutorials/effector_tutorial.py index 5dbf723c66..64fe930d7d 100644 --- a/examples/tutorials/effector_tutorial.py +++ b/examples/tutorials/effector_tutorial.py @@ -13,11 +13,11 @@ # # .. note:: # -# This tutorial requires FFmpeg libraries (>=5.0, <6). +# This tutorial requires FFmpeg libraries (>=4.1, <5). # # There are multiple ways to install FFmpeg libraries. # If you are using Anaconda Python distribution, -# ``conda install -c conda-forge 'ffmpeg<6'`` will install +# ``conda install -c anaconda 'ffmpeg<5'`` will install # the required libraries. # @@ -60,10 +60,7 @@ for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items(): print(k, v) except Exception: - raise RuntimeError( - "This tutorial requires FFmpeg libraries 4.2>,<5. " - "Please install FFmpeg." - ) + raise RuntimeError("This tutorial requires FFmpeg libraries 4.2>,<5. 
" "Please install FFmpeg.") ###################################################################### # Usage @@ -107,11 +104,11 @@ # -def show(effect=None, format=None, *, stereo=False): +def show(effect, *, stereo=False): wf = torch.cat([waveform] * 2, dim=1) if stereo else waveform figsize = (6.4, 2.1 if stereo else 1.2) - effector = AudioEffector(effect=effect, format=format, pad_end=False) + effector = AudioEffector(effect=effect, pad_end=False) result = effector.apply(wf, int(sr)) num_channels = result.size(1) @@ -128,7 +125,7 @@ def show(effect=None, format=None, *, stereo=False): # -------- # -show(effect=None, format=None) +show(effect=None) ###################################################################### # Effects @@ -139,131 +136,138 @@ def show(effect=None, format=None, *, stereo=False): # tempo # ~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#atempo -show(effect="atempo=0.7") +show("atempo=0.7") ###################################################################### # -show(effect="atempo=1.8") +show("atempo=1.8") ###################################################################### # highpass # ~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#highpass -show(effect="highpass=frequency=1500") +show("highpass=frequency=1500") ###################################################################### # lowpass # ~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#lowpass -show(effect="lowpass=frequency=1000") +show("lowpass=frequency=1000") ###################################################################### # allpass # ~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#allpass -show(effect="allpass") +show("allpass") ###################################################################### # bandpass # ~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#bandpass -show(effect="bandpass=frequency=3000") +show("bandpass=frequency=3000") ###################################################################### # bandreject # ~~~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#bandreject -show(effect="bandreject=frequency=3000") +show("bandreject=frequency=3000") ###################################################################### # echo # ~~~~ # https://ffmpeg.org/ffmpeg-filters.html#aecho -show(effect="aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4") +show("aecho=in_gain=0.8:out_gain=0.88:delays=6:decays=0.4") ###################################################################### # -show(effect="aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4") +show("aecho=in_gain=0.8:out_gain=0.88:delays=60:decays=0.4") ###################################################################### # -show(effect="aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3") +show("aecho=in_gain=0.8:out_gain=0.9:delays=1000:decays=0.3") ###################################################################### # chorus # ~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#chorus -show(effect=("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3")) +show("chorus=0.5:0.9:50|60|40:0.4|0.32|0.3:0.25|0.4|0.3:2|2.3|1.3") ###################################################################### # fft filter # ~~~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#afftfilt -show(effect=( + +# fmt: off +show( "afftfilt=" "real='re * (1-clip(b * (b/nb), 0, 1))':" - "imag='im * (1-clip(b * (b/nb), 0, 1))'")) + "imag='im * (1-clip(b * (b/nb), 0, 1))'" +) ###################################################################### # -show(effect=( + +show( "afftfilt=" "real='hypot(re,im) * sin(0)':" "imag='hypot(re,im) * cos(0)':" 
"win_size=512:" - "overlap=0.75")) - + "overlap=0.75" +) ###################################################################### # -show(effect=( + +show( "afftfilt=" "real='hypot(re,im) * cos(2 * 3.14 * (random(0) * 2-1))':" "imag='hypot(re,im) * sin(2 * 3.14 * (random(1) * 2-1))':" "win_size=128:" - "overlap=0.8")) + "overlap=0.8" +) +# fmt: on ###################################################################### # vibrato # ~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#vibrato -show(effect=("vibrato=f=10:d=0.8")) +show("vibrato=f=10:d=0.8") ###################################################################### # tremolo # ~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#tremolo -show(effect=("tremolo=f=8:d=0.8")) +show("tremolo=f=8:d=0.8") ###################################################################### # crystalizer # ~~~~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#crystalizer -show(effect=("crystalizer")) +show("crystalizer") ###################################################################### # flanger # ~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#flanger -show(effect=("flanger")) +show("flanger") ###################################################################### # phaser # ~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#aphaser -show(effect=("aphaser")) +show("aphaser") ###################################################################### # pulsator # ~~~~~~~~ # https://ffmpeg.org/ffmpeg-filters.html#apulsator -show(effect=("apulsator"), stereo=True) +show("apulsator", stereo=True) ###################################################################### # haas # ~~~~ # https://ffmpeg.org/ffmpeg-filters.html#haas -show(effect=("haas")) +show("haas") ###################################################################### # Codecs @@ -292,11 +296,13 @@ def show_multi(configs): # ~~~ # -results = show_multi([ - {"format": "ogg"}, - {"format": "ogg", "encoder": "vorbis"}, - {"format": "ogg", "encoder": "opus"}, -]) +results = show_multi( + [ + {"format": "ogg"}, + {"format": "ogg", "encoder": "vorbis"}, + {"format": "ogg", "encoder": "opus"}, + ] +) ###################################################################### # ogg - default encoder (flac) @@ -321,15 +327,17 @@ def show_multi(configs): # ~~~ # https://trac.ffmpeg.org/wiki/Encode/MP3 -results = show_multi([ - {"format": "mp3"}, - {"format": "mp3", "codec_config": CodecConfig(compression_level=1)}, - {"format": "mp3", "codec_config": CodecConfig(compression_level=9)}, - {"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)}, - {"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)}, - {"format": "mp3", "codec_config": CodecConfig(qscale=9)}, - {"format": "mp3", "codec_config": CodecConfig(qscale=1)}, -]) +results = show_multi( + [ + {"format": "mp3"}, + {"format": "mp3", "codec_config": CodecConfig(compression_level=1)}, + {"format": "mp3", "codec_config": CodecConfig(compression_level=9)}, + {"format": "mp3", "codec_config": CodecConfig(bit_rate=192_000)}, + {"format": "mp3", "codec_config": CodecConfig(bit_rate=8_000)}, + {"format": "mp3", "codec_config": CodecConfig(qscale=9)}, + {"format": "mp3", "codec_config": CodecConfig(qscale=1)}, + ] +) ###################################################################### # default diff --git a/examples/tutorials/filter_design_tutorial.py b/examples/tutorials/filter_design_tutorial.py index c0581cd9d1..944a7df3f8 100644 --- a/examples/tutorials/filter_design_tutorial.py +++ b/examples/tutorials/filter_design_tutorial.py @@ -27,14 +27,11 @@ 
print(torch.__version__) print(torchaudio.__version__) +import matplotlib.pyplot as plt + ###################################################################### # -from torchaudio.prototype.functional import ( - sinc_impulse_response, - frequency_impulse_response, -) - -import matplotlib.pyplot as plt +from torchaudio.prototype.functional import frequency_impulse_response, sinc_impulse_response ###################################################################### # @@ -75,7 +72,7 @@ # :py:func:`~torchaudio.prototype.functional.sinc_impulse_response`. # -cutoff = torch.linspace(0., 1., 9) +cutoff = torch.linspace(0.0, 1.0, 9) irs = sinc_impulse_response(cutoff, window_size=513) print("Cutoff shape:", cutoff.shape) @@ -87,6 +84,7 @@ # Let's visualize the resulting impulse responses. # + def plot_sinc_ir(irs, cutoff): num_filts, window_size = irs.shape half = window_size // 2 @@ -99,7 +97,8 @@ def plot_sinc_ir(irs, cutoff): ax.grid(True) fig.suptitle( "Impulse response of sinc low-pass filter for different cut-off frequencies\n" - "(Frequencies are relative to Nyquist frequency)") + "(Frequencies are relative to Nyquist frequency)" + ) axes[-1].set_xticks([i * half // 4 for i in range(-4, 5)]) plt.tight_layout() @@ -126,12 +125,12 @@ def plot_sinc_ir(irs, cutoff): # Let's visualize the resulting frequency responses. # + def plot_sinc_fr(frs, cutoff, band=False): num_filts, num_fft = frs.shape num_ticks = num_filts + 1 if band else num_filts - fig, axes = plt.subplots( - num_filts, 1, sharex=True, sharey=True, figsize=(6.4, 4.8 * 1.5)) + fig, axes = plt.subplots(num_filts, 1, sharex=True, sharey=True, figsize=(6.4, 4.8 * 1.5)) for ax, fr, coff, color in zip(axes, frs, cutoff, plt.cm.tab10.colors): ax.grid(True) ax.semilogy(fr, color=color, zorder=4, label=f"Cutoff: {coff}") @@ -141,11 +140,12 @@ def plot_sinc_fr(frs, cutoff, band=False): yticks=[1e-9, 1e-6, 1e-3, 1], xticks=torch.linspace(0, num_fft, num_ticks), xticklabels=[f"{i/(num_ticks - 1)}" for i in range(num_ticks)], - xlabel="Frequency" + xlabel="Frequency", ) fig.suptitle( "Frequency response of sinc low-pass filter for different cut-off frequencies\n" - "(Frequencies are relative to Nyquist frequency)") + "(Frequencies are relative to Nyquist frequency)" + ) plt.tight_layout() @@ -193,13 +193,11 @@ def plot_sinc_fr(frs, cutoff, band=False): # Band-pass filter can be obtained by subtracting low-pass filter for # upper band from that of lower band. 
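As a quick numerical sanity check of this claim (not part of the tutorial), one can look at the magnitude response of the difference of two sinc low-pass impulse responses; this assumes the prototype API from a nightly build, and cutoff frequencies are relative to the Nyquist frequency as elsewhere in the tutorial:

.. code-block:: python

    import torch
    from torchaudio.prototype.functional import sinc_impulse_response

    c_low, c_high = torch.tensor([0.2]), torch.tensor([0.4])
    ir = sinc_impulse_response(c_low, window_size=513) - sinc_impulse_response(c_high, window_size=513)
    fr = torch.fft.rfft(ir, n=2048, dim=1).abs()

    # Magnitude is close to 1 inside the (0.2, 0.4) band and close to 0 outside.
    print(fr[0, int(0.3 * 1024)].item())  # in-band, ~1
    print(fr[0, int(0.1 * 1024)].item())  # below the band, ~0
    print(fr[0, int(0.6 * 1024)].item())  # above the band, ~0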
-cutoff = torch.linspace(0., 1, 11) +cutoff = torch.linspace(0.0, 1, 11) c_low = cutoff[:-1] c_high = cutoff[1:] -irs = ( - sinc_impulse_response(c_low, window_size=513) - - sinc_impulse_response(c_high, window_size=513)) +irs = sinc_impulse_response(c_low, window_size=513) - sinc_impulse_response(c_high, window_size=513) frs = torch.fft.rfft(irs, n=2048, dim=1).abs() ###################################################################### @@ -256,6 +254,7 @@ def plot_sinc_fr(frs, cutoff, band=False): ###################################################################### # + def plot_ir(magnitudes, ir, num_fft=2048): fr = torch.fft.rfft(ir, n=num_fft, dim=0).abs() ir_size = ir.size(-1) @@ -268,17 +267,18 @@ def plot_ir(magnitudes, ir, num_fft=2048): axes[0].set(title="Impulse Response") axes[0].set_xticks([i * half // 4 for i in range(-4, 5)]) t = torch.linspace(0, 1, fr.numel()) - axes[1].plot(t, fr, label='Actual') - axes[2].semilogy(t, fr, label='Actual') + axes[1].plot(t, fr, label="Actual") + axes[2].semilogy(t, fr, label="Actual") t = torch.linspace(0, 1, magnitudes.numel()) for i in range(1, 3): - axes[i].plot(t, magnitudes, label='Desired (input)', linewidth=1.1, linestyle='--') + axes[i].plot(t, magnitudes, label="Desired (input)", linewidth=1.1, linestyle="--") axes[i].grid(True) axes[1].set(title="Frequency Response") axes[2].set(title="Frequency Response (log-scale)", xlabel="Frequency") axes[2].legend(loc="lower right") fig.tight_layout() + ###################################################################### # @@ -305,7 +305,7 @@ def plot_ir(magnitudes, ir, num_fft=2048): # # -magnitudes = torch.linspace(0, 1, 64)**4.0 +magnitudes = torch.linspace(0, 1, 64) ** 4.0 ir = frequency_impulse_response(magnitudes) @@ -316,7 +316,7 @@ def plot_ir(magnitudes, ir, num_fft=2048): ###################################################################### # -magnitudes = torch.sin(torch.linspace(0, 10, 64))**4.0 +magnitudes = torch.sin(torch.linspace(0, 10, 64)) ** 4.0 ir = frequency_impulse_response(magnitudes) diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py new file mode 100644 index 0000000000..01333d7175 --- /dev/null +++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py @@ -0,0 +1,906 @@ +""" +Forced alignment for multilingual data +====================================== + +**Author**: `Xiaohui Zhang `__ + +This tutorial shows how to compute forced alignments for speech data +from multiple non-English languages using ``torchaudio``'s CTC forced alignment +API described in `“CTC forced alignment +tutorial” `__ +and the multilingual Wav2vec2 model proposed in the paper `“Scaling +Speech Technology to 1,000+ +Languages” `__. +The model was trained on 23K of audio data from 1100+ languages using +the `“uroman vocabulary” `__ +as targets. + +""" + +import torch +import torchaudio + +print(torch.__version__) +print(torchaudio.__version__) + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print(device) + + +try: + from torchaudio.functional import forced_align +except ModuleNotFoundError: + print( + "Failed to import the forced alignment API. " + "Please install torchaudio nightly builds. " + "Please refer to https://pytorch.org/get-started/locally " + "for instructions to install a nightly build." 
+ ) + raise + +###################################################################### +# Preparation +# ----------- +# +# Here we import necessary packages, and define utility functions for +# computing the frame-level alignments (using the API +# ``functional.forced_align``), token-level and word-level alignments, and +# also alignment visualization utilities. +# + +# %matplotlib inline +from dataclasses import dataclass + +import IPython + +import matplotlib.pyplot as plt + +torch.random.manual_seed(0) + +sample_rate = 16000 + + +@dataclass +class Frame: + # This is the index of each token in the transcript, + # i.e. the current frame aligns to the N-th character from the transcript. + token_index: int + time_index: int + score: float + + +@dataclass +class Segment: + label: str + start: int + end: int + score: float + + def __repr__(self): + return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})" + + @property + def length(self): + return self.end - self.start + + +# compute frame-level and word-level alignments using torchaudio's forced alignment API +def compute_alignments(transcript, dictionary, emission): + frames = [] + tokens = [dictionary[c] for c in transcript.replace(" ", "")] + + targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0) + input_lengths = torch.tensor([emission.shape[1]]) + target_lengths = torch.tensor([targets.shape[1]]) + + # This is the key step, where we call the forced alignment API functional.forced_align to compute frame alignments. + frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0) + + assert frame_alignment.shape[1] == input_lengths[0].item() + assert targets.shape[1] == target_lengths[0].item() + + token_index = -1 + prev_hyp = 0 + for i in range(frame_alignment.shape[1]): + if frame_alignment[0][i].item() == 0: + prev_hyp = 0 + continue + + if frame_alignment[0][i].item() != prev_hyp: + token_index += 1 + frames.append(Frame(token_index, i, frame_scores[0][i].exp().item())) + prev_hyp = frame_alignment[0][i].item() + + # merge frame-level alignments into token-level alignments (segments) + transcript_nospace = transcript.replace(" ", "") + i1, i2 = 0, 0 + segments = [] + while i1 < len(frames): + while i2 < len(frames) and frames[i1].token_index == frames[i2].token_index: + i2 += 1 + score = sum(frames[k].score for k in range(i1, i2)) / (i2 - i1) + + segments.append( + Segment( + transcript_nospace[frames[i1].token_index], + frames[i1].time_index, + frames[i2 - 1].time_index + 1, + score, + ) + ) + i1 = i2 + + # compute word-level alignments from token-level alignments + separator = " " + words = [] + i1, i2, i3 = 0, 0, 0 + while i3 < len(transcript): + if i3 == len(transcript) - 1 or transcript[i3] == separator: + if i1 != i2: + if i3 == len(transcript) - 1: + i2 += 1 + s = 0 + segs = segments[i1 + s : i2 + s] + word = "".join([seg.label for seg in segs]) + score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs) + words.append(Segment(word, segments[i1 + s].start, segments[i2 + s - 1].end, score)) + i1 = i2 + else: + i2 += 1 + i3 += 1 + + num_frames = frame_alignment.shape[1] + return segments, words, num_frames + + +# utility function for plotting word alignments +def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): + fig, ax2 = plt.subplots(figsize=(64, 12)) + plt.rcParams.update({"font.size": 30}) + + # The original waveform + ratio = waveform.size(1) / input_lengths + ax2.plot(waveform) + ax2.set_ylim(-1.0 * scale, 1.0 * scale) + ax2.set_xlim(0,
waveform.size(-1)) + + for word in word_segments: + x0 = ratio * word.start + x1 = ratio * word.end + ax2.axvspan(x0, x1, alpha=0.1, color="red") + ax2.annotate(f"{word.score:.2f}", (x0, 0.8 * scale)) + + for seg in segments: + if seg.label != "|": + ax2.annotate(seg.label, (seg.start * ratio, 0.9 * scale)) + + xticks = ax2.get_xticks() + plt.xticks(xticks, xticks / sample_rate, fontsize=50) + ax2.set_xlabel("time [second]", fontsize=40) + ax2.set_yticks([]) + + +# utility function for playing audio segments. +# A trick to embed the resulting audio into the generated file. +# `IPython.display.Audio` has to be the last call in a cell, +# and there should be only one call per cell. +def display_segment(i, waveform, word_segments, num_frames): + ratio = waveform.size(1) / num_frames + word = word_segments[i] + x0 = int(ratio * word.start) + x1 = int(ratio * word.end) + print(f"{word.label} ({word.score:.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec") + segment = waveform[:, x0:x1] + return IPython.display.Audio(segment.numpy(), rate=sample_rate) + + +###################################################################### +# Aligning multilingual data +# -------------------------- +# +# Here we show examples of computing forced alignments of utterances in +# 5 languages using the multilingual Wav2vec2 model, with the alignments visualized. +# One can also play the whole audio and audio segments aligned with each word, in +# order to verify the alignment quality. Here we first load the model and dictionary. +# + +from torchaudio.models import wav2vec2_model + +model = wav2vec2_model( + extractor_mode="layer_norm", + extractor_conv_layer_config=[ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + extractor_conv_bias=True, + encoder_embed_dim=1024, + encoder_projection_dropout=0.0, + encoder_pos_conv_kernel=128, + encoder_pos_conv_groups=16, + encoder_num_layers=24, + encoder_num_heads=16, + encoder_attention_dropout=0.0, + encoder_ff_interm_features=4096, + encoder_ff_interm_dropout=0.1, + encoder_dropout=0.0, + encoder_layer_norm_first=True, + encoder_layer_drop=0.1, + aux_num_out=31, +) + + +model.load_state_dict( + torch.hub.load_state_dict_from_url( + "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt" + ) +) +model.eval() + + +def get_emission(waveform): + # NOTE: this step is essential + waveform = torch.nn.functional.layer_norm(waveform, waveform.shape) + + emissions, _ = model(waveform) + emissions = torch.log_softmax(emissions, dim=-1) + emission = emissions.cpu().detach() + + # Append the extra dimension corresponding to the <star> token + extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1) + emissions = torch.cat((emissions.cpu(), extra_dim), 2) + emission = emissions.detach() + return emission, waveform + + +# Construct the dictionary +# '@' represents the OOV token, '*' represents the <star> token. +# <pad> and </s> are fairseq's legacy tokens, which are not used.
+dictionary = { + "<blank>": 0, + "<pad>": 1, + "</s>": 2, + "@": 3, + "a": 4, + "i": 5, + "e": 6, + "n": 7, + "o": 8, + "u": 9, + "t": 10, + "s": 11, + "r": 12, + "m": 13, + "k": 14, + "l": 15, + "d": 16, + "g": 17, + "h": 18, + "y": 19, + "b": 20, + "p": 21, + "w": 22, + "c": 23, + "v": 24, + "j": 25, + "z": 26, + "f": 27, + "'": 28, + "q": 29, + "x": 30, + "*": 31, +} + + +###################################################################### +# Before aligning the speech with transcripts, we need to make sure +# the transcripts are already romanized. Here are the bash commands +# required for saving the raw transcript to a file, downloading the uroman +# romanizer and using it to obtain romanized transcripts, and the Python +# commands required for further normalizing the romanized transcript. +# + +# %% +# .. code-block:: bash +# +# %%bash +# Save the raw transcript to a file +# echo 'raw text' > text.txt +# git clone https://github.com/isi-nlp/uroman +# uroman/bin/uroman.pl < text.txt > text_romanized.txt +# + +###################################################################### +# .. code-block:: python +# +# import re +# def normalize_uroman(text): +# text = text.lower() +# text = text.replace("’", "'") +# text = re.sub("([^a-z' ])", " ", text) +# text = re.sub(' +', ' ', text) +# return text.strip() +# +# file = "text_romanized.txt" +# f = open(file, "r") +# lines = f.readlines() +# text_normalized = normalize_uroman(lines[0].strip()) +# + + +###################################################################### +# German example: +# ~~~~~~~~~~~~~~~~ + +text_raw = ( + "aber seit ich bei ihnen das brot hole brauch ich viel weniger schulze wandte sich ab die kinder taten ihm leid" +) +text_normalized = ( + "aber seit ich bei ihnen das brot hole brauch ich viel weniger schulze wandte sich ab die kinder taten ihm leid" +) +speech_file = torchaudio.utils.download_asset("tutorial-assets/10349_8674_000087.flac", progress=False) +waveform, _ = torchaudio.load(speech_file) + +emission, waveform = get_emission(waveform) +assert len(dictionary) == emission.shape[2] + +transcript = text_normalized + +segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) + +###################################################################### +# + +display_segment(0, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(1, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(2, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(3, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(4, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(5, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(6, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(7, waveform, word_segments, num_frames) +
+###################################################################### +# + +display_segment(8, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(9, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(10, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(11, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(12, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(13, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(14, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(15, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(16, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(17, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(18, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(19, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(20, waveform, word_segments, num_frames) + + +###################################################################### +# Chinese example: +# ~~~~~~~~~~~~~~~~ +# +# Chinese is a character-based language, and there is no explicit word-level +# tokenization (separated by spaces) in its raw written form. In order to +# obtain word level alignments, you need to first tokenize the transcripts +# at the word level using a word tokenizer like `“Stanford +# Tokenizer” `__. +# However, this is not needed if you only want character-level alignments.
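As a concrete illustration of this word-level tokenization step: the tutorial points to the Stanford Tokenizer, but any word segmenter works. The sketch below uses ``jieba`` purely because it is easy to install; it is my own illustration and not part of the tutorial.

.. code-block:: python

    import jieba

    text_raw = "关服务高端产品仍处于供不应求的局面"
    # Insert spaces between words so that the romanized transcript is word-separated.
    text_segmented = " ".join(jieba.cut(text_raw))
    print(text_segmented)  # e.g. "关 服务 高端 产品 仍 处于 供不应求 的 局面"
    # The segmented text is then romanized with uroman and normalized as shown
    # earlier, yielding the space-separated `text_normalized` used below.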
+# + +text_raw = "关 服务 高端 产品 仍 处于 供不应求 的 局面" +text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian" +speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False) +waveform, _ = torchaudio.load(speech_file) +waveform = waveform[0:1] + +emission, waveform = get_emission(waveform) + +transcript = text_normalized + +segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) + +###################################################################### +# + +display_segment(0, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(1, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(2, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(3, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(4, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(5, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(6, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(7, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(8, waveform, word_segments, num_frames) + + +###################################################################### +# Polish example: +# ~~~~~~~~~~~~~~~ + + +text_raw = "wtedy ujrzałem na jego brzuchu okrągłą czarną ranę dlaczego mi nie powiedziałeś szepnąłem ze łzami" +text_normalized = "wtedy ujrzalem na jego brzuchu okragla czarna rane dlaczego mi nie powiedziales szepnalem ze lzami" +speech_file = torchaudio.utils.download_asset("tutorial-assets/5090_1447_000088.flac", progress=False) +waveform, _ = torchaudio.load(speech_file) + +emission, waveform = get_emission(waveform) + +transcript = text_normalized + +segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) + +###################################################################### +# + +display_segment(0, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(1, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(2, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(3, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(4, waveform, word_segments, num_frames) + 
+###################################################################### +# + +display_segment(5, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(6, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(7, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(8, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(9, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(10, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(11, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(12, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(13, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(14, waveform, word_segments, num_frames) + + +###################################################################### +# Portuguese example: +# ~~~~~~~~~~~~~~~~~~~ + + +text_raw = ( + "mas na imensa extensão onde se esconde o inconsciente imortal só me responde um bramido um queixume e nada mais" +) +text_normalized = ( + "mas na imensa extensao onde se esconde o inconsciente imortal so me responde um bramido um queixume e nada mais" +) +speech_file = torchaudio.utils.download_asset("tutorial-assets/6566_5323_000027.flac", progress=False) +waveform, _ = torchaudio.load(speech_file) + +emission, waveform = get_emission(waveform) + +transcript = text_normalized + +segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) + +###################################################################### +# + +display_segment(0, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(1, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(2, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(3, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(4, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(5, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(6, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(7, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(8, waveform, word_segments, num_frames) + 
+###################################################################### +# + +display_segment(9, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(10, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(11, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(12, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(13, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(14, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(15, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(16, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(17, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(18, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(19, waveform, word_segments, num_frames) + + +###################################################################### +# Italian example: +# ~~~~~~~~~~~~~~~~ + +text_raw = "elle giacean per terra tutte quante fuor d'una ch'a seder si levò ratto ch'ella ci vide passarsi davante" +text_normalized = ( + "elle giacean per terra tutte quante fuor d'una ch'a seder si levo ratto ch'ella ci vide passarsi davante" +) +speech_file = torchaudio.utils.download_asset("tutorial-assets/642_529_000025.flac", progress=False) +waveform, _ = torchaudio.load(speech_file) + +emission, waveform = get_emission(waveform) + +transcript = text_normalized + +segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) + +print("Raw Transcript: ", text_raw) +print("Normalized Transcript: ", text_normalized) +IPython.display.Audio(waveform, rate=sample_rate) + +###################################################################### +# + +display_segment(0, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(1, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(2, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(3, waveform, word_segments, num_frames) + + +###################################################################### +# + +display_segment(4, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(5, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(6, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(7, waveform, word_segments, num_frames) + 
+###################################################################### +# + +display_segment(8, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(9, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(10, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(11, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(12, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(13, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(14, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(15, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(16, waveform, word_segments, num_frames) + +###################################################################### +# + +display_segment(17, waveform, word_segments, num_frames) + + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we looked at how to use torchaudio’s forced alignment +# API and a Wav2Vec2 pre-trained multilingual acoustic model to align +# speech data to transcripts in five languages. +# + + +###################################################################### +# Acknowledgement +# --------------- +# +# Thanks to `Vineel Pratap `__ and `Zhaoheng +# Ni `__ for working on the forced aligner API, and +# `Moto Hira `__ for providing alignment merging and +# visualization utilities. +# diff --git a/examples/tutorials/forced_alignment_tutorial.py b/examples/tutorials/forced_alignment_tutorial.py index 2a09e342f7..ab98908559 100644 --- a/examples/tutorials/forced_alignment_tutorial.py +++ b/examples/tutorials/forced_alignment_tutorial.py @@ -9,6 +9,17 @@ `CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition `__. +.. note:: + + The implementation in this tutorial is simplified for + educational purposes. + + If you are looking to align your corpus, we recommend using + :py:func:`torchaudio.functional.forced_align`, which is more + accurate and faster. + + Please refer to `this tutorial <./ctc_forced_alignment_api_tutorial.html>`__ + for the details of :py:func:`~torchaudio.functional.forced_align`. """ import torch @@ -138,7 +149,9 @@ # [`distill.pub `__]) # -transcript = "I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT" + +# We enclose the transcript with space tokens, which represent SOS and EOS. +transcript = "|I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|" dictionary = {c: i for i, c in enumerate(labels)} tokens = [dictionary[c] for c in transcript] @@ -149,21 +162,17 @@ def get_trellis(emission, tokens, blank_id=0): num_frame = emission.size(0) num_tokens = len(tokens) - # Trellis has extra diemsions for both time axis and tokens. - # The extra dim for tokens represents (start-of-sentence) - # The extra dim for time axis is for simplification of the code.
- trellis = torch.empty((num_frame + 1, num_tokens + 1)) - trellis[0, 0] = 0 - trellis[1:, 0] = torch.cumsum(emission[:, blank_id], 0) - trellis[0, -num_tokens:] = -float("inf") - trellis[-num_tokens:, 0] = float("inf") + trellis = torch.zeros((num_frame, num_tokens)) + trellis[1:, 0] = torch.cumsum(emission[1:, blank_id], 0) + trellis[0, 1:] = -float("inf") + trellis[-num_tokens + 1 :, 0] = float("inf") - for t in range(num_frame): + for t in range(num_frame - 1): trellis[t + 1, 1:] = torch.maximum( # Score for staying at the same token trellis[t, 1:] + emission[t, blank_id], # Score for changing to the next token - trellis[t, :-1] + emission[t, tokens], + trellis[t, :-1] + emission[t, tokens[1:]], ) return trellis @@ -173,8 +182,9 @@ def get_trellis(emission, tokens, blank_id=0): ################################################################################ # Visualization ################################################################################ -plt.imshow(trellis[1:, 1:].T, origin="lower") +plt.imshow(trellis.T, origin="lower") plt.annotate("- Inf", (trellis.size(1) / 5, trellis.size(1) / 1.5)) +plt.annotate("+ Inf", (trellis.size(0) - trellis.size(1) / 5, trellis.size(1) / 3)) plt.colorbar() plt.show() @@ -214,38 +224,38 @@ class Point: def backtrack(trellis, emission, tokens, blank_id=0): - # Note: - # j and t are indices for trellis, which has extra dimensions - # for time and tokens at the beginning. - # When referring to time frame index `T` in trellis, - # the corresponding index in emission is `T-1`. - # Similarly, when referring to token index `J` in trellis, - # the corresponding index in transcript is `J-1`. - j = trellis.size(1) - 1 - t_start = torch.argmax(trellis[:, j]).item() - - path = [] - for t in range(t_start, 0, -1): + t, j = trellis.size(0) - 1, trellis.size(1) - 1 + + path = [Point(j, t, emission[t, blank_id].exp().item())] + while j > 0: + # Should not happen but just in case + assert t > 0 + # 1. Figure out if the current position was stay or change - # Note (again): - # `emission[J-1]` is the emission at time frame `J` of trellis dimension. - # Score for token staying the same from time frame J-1 to T. - stayed = trellis[t - 1, j] + emission[t - 1, blank_id] - # Score for token changing from C-1 at T-1 to J at T. - changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]] - - # 2. Store the path with frame-wise probability. - prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item() - # Return token index and time index in non-trellis coordinate. - path.append(Point(j - 1, t - 1, prob)) - - # 3. Update the token + # Frame-wise score of stay vs change + p_stay = emission[t - 1, blank_id] + p_change = emission[t - 1, tokens[j]] + + # Context-aware score for stay vs change + stayed = trellis[t - 1, j] + p_stay + changed = trellis[t - 1, j - 1] + p_change + + # Update position + t -= 1 if changed > stayed: j -= 1 - if j == 0: - break - else: - raise ValueError("Failed to align") + + # Store the path with frame-wise probability. + prob = (p_change if changed > stayed else p_stay).exp().item() + path.append(Point(j, t, prob)) + + # Now j == 0, which means, it reached the SoS. 
+ # Fill up the rest for the sake of visualization + while t > 0: + prob = emission[t - 1, blank_id].exp().item() + path.append(Point(j, t - 1, prob)) + t -= 1 + return path[::-1] @@ -262,7 +272,7 @@ def plot_trellis_with_path(trellis, path): trellis_with_path = trellis.clone() for _, p in enumerate(path): trellis_with_path[p.time_index, p.token_index] = float("nan") - plt.imshow(trellis_with_path[1:, 1:].T, origin="lower") + plt.imshow(trellis_with_path.T, origin="lower") plot_trellis_with_path(trellis, path) @@ -326,7 +336,7 @@ def plot_trellis_with_segments(trellis, segments, transcript): trellis_with_path = trellis.clone() for i, seg in enumerate(segments): if seg.label != "|": - trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan") + trellis_with_path[seg.start : seg.end, i] = float("nan") fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5)) ax1.set_title("Path, label and probability for each label") @@ -335,8 +345,8 @@ def plot_trellis_with_segments(trellis, segments, transcript): for i, seg in enumerate(segments): if seg.label != "|": - ax1.annotate(seg.label, (seg.start + 0.7, i + 0.3), weight="bold") - ax1.annotate(f"{seg.score:.2f}", (seg.start - 0.3, i + 4.3)) + ax1.annotate(seg.label, (seg.start, i - 0.7), weight="bold") + ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3)) ax2.set_title("Label probability with and without repetation") xs, hs, ws = [], [], [] @@ -405,11 +415,11 @@ def plot_alignments(trellis, segments, word_segments, waveform): trellis_with_path = trellis.clone() for i, seg in enumerate(segments): if seg.label != "|": - trellis_with_path[seg.start + 1 : seg.end + 1, i + 1] = float("nan") + trellis_with_path[seg.start : seg.end, i] = float("nan") fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9.5)) - ax1.imshow(trellis_with_path[1:, 1:].T, origin="lower") + ax1.imshow(trellis_with_path.T, origin="lower") ax1.set_xticks([]) ax1.set_yticks([]) @@ -419,11 +429,11 @@ def plot_alignments(trellis, segments, word_segments, waveform): for i, seg in enumerate(segments): if seg.label != "|": - ax1.annotate(seg.label, (seg.start, i + 0.3)) - ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 4), fontsize=8) + ax1.annotate(seg.label, (seg.start, i - 0.7)) + ax1.annotate(f"{seg.score:.2f}", (seg.start, i + 3), fontsize=8) # The original waveform - ratio = waveform.size(0) / (trellis.size(0) - 1) + ratio = waveform.size(0) / trellis.size(0) ax2.plot(waveform) for word in word_segments: x0 = ratio * word.start @@ -450,14 +460,17 @@ def plot_alignments(trellis, segments, word_segments, waveform): ) plt.show() + ################################################################################ +# Audio Samples +# ------------- # # A trick to embed the resulting audio to the generated file. # `IPython.display.Audio` has to be the last call in a cell, # and there should be only one call par cell. 
def display_segment(i): - ratio = waveform.size(1) / (trellis.size(0) - 1) + ratio = waveform.size(1) / trellis.size(0) word = word_segments[i] x0 = int(ratio * word.start) x1 = int(ratio * word.end) diff --git a/examples/tutorials/hybrid_demucs_tutorial.py b/examples/tutorials/hybrid_demucs_tutorial.py index f0c5778940..8be6c9903b 100644 --- a/examples/tutorials/hybrid_demucs_tutorial.py +++ b/examples/tutorials/hybrid_demucs_tutorial.py @@ -45,6 +45,8 @@ print(torch.__version__) print(torchaudio.__version__) +import matplotlib.pyplot as plt + ###################################################################### # In addition to ``torchaudio``, ``mir_eval`` is required to perform # signal-to-distortion ratio (SDR) calculations. To install ``mir_eval`` @@ -52,30 +54,9 @@ # from IPython.display import Audio +from mir_eval import separation +from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS from torchaudio.utils import download_asset -import matplotlib.pyplot as plt - -try: - from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB_PLUS - from mir_eval import separation - -except ModuleNotFoundError: - try: - import google.colab - - print( - """ - To enable running this notebook in Google Colab, install nightly - torch and torchaudio builds by adding the following code block to the top - of the notebook before running it: - !pip3 uninstall -y torch torchvision torchaudio - !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu - !pip3 install mir_eval - """ - ) - except ModuleNotFoundError: - pass - raise ###################################################################### # 3. Construct the pipeline @@ -130,11 +111,11 @@ def separate_sources( - model, - mix, - segment=10., - overlap=0.1, - device=None, + model, + mix, + segment=10.0, + overlap=0.1, + device=None, ): """ Apply model to a given mixture. Use fade, and add segments together in order to add model segment by segment. @@ -157,7 +138,7 @@ def separate_sources( start = 0 end = chunk_len overlap_frames = overlap * sample_rate - fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape='linear') + fade = Fade(fade_in_len=0, fade_out_len=int(overlap_frames), fade_shape="linear") final = torch.zeros(batch, len(model.sources), channels, length, device=device) @@ -265,12 +246,13 @@ def plot_spectrogram(stft, title="Spectrogram"): # scores. 
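For readers unfamiliar with ``mir_eval``, the SDR computation used in the next hunk boils down to a single call to ``bss_eval_sources``. A toy example on synthetic signals (my own illustration, not part of the tutorial):

.. code-block:: python

    import numpy as np
    from mir_eval import separation

    reference = np.random.randn(2, 16000)                    # two reference sources, 1 s at 16 kHz
    estimate = reference + 0.01 * np.random.randn(2, 16000)  # slightly perturbed estimates
    sdr, sir, sar, perm = separation.bss_eval_sources(reference, estimate)
    print("Mean SDR:", sdr.mean())  # high, since the estimates are close to the references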
# + def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor, source: str): - print("SDR score is:", - separation.bss_eval_sources( - original_source.detach().numpy(), - predicted_source.detach().numpy())[0].mean()) - plot_spectrogram(stft(predicted_source)[0], f'Spectrogram {source}') + print( + "SDR score is:", + separation.bss_eval_sources(original_source.detach().numpy(), predicted_source.detach().numpy())[0].mean(), + ) + plot_spectrogram(stft(predicted_source)[0], f"Spectrogram {source}") return Audio(predicted_source, rate=sample_rate) @@ -285,19 +267,19 @@ def output_results(original_source: torch.Tensor, predicted_source: torch.Tensor vocals_original = download_asset("tutorial-assets/hdemucs_vocals_segment.wav") other_original = download_asset("tutorial-assets/hdemucs_other_segment.wav") -drums_spec = audios["drums"][:, frame_start: frame_end].cpu() +drums_spec = audios["drums"][:, frame_start:frame_end].cpu() drums, sample_rate = torchaudio.load(drums_original) -bass_spec = audios["bass"][:, frame_start: frame_end].cpu() +bass_spec = audios["bass"][:, frame_start:frame_end].cpu() bass, sample_rate = torchaudio.load(bass_original) -vocals_spec = audios["vocals"][:, frame_start: frame_end].cpu() +vocals_spec = audios["vocals"][:, frame_start:frame_end].cpu() vocals, sample_rate = torchaudio.load(vocals_original) -other_spec = audios["other"][:, frame_start: frame_end].cpu() +other_spec = audios["other"][:, frame_start:frame_end].cpu() other, sample_rate = torchaudio.load(other_original) -mix_spec = mixture[:, frame_start: frame_end].cpu() +mix_spec = mixture[:, frame_start:frame_end].cpu() ###################################################################### diff --git a/examples/tutorials/mvdr_tutorial.py b/examples/tutorials/mvdr_tutorial.py index 869dd59535..7c9013d180 100644 --- a/examples/tutorials/mvdr_tutorial.py +++ b/examples/tutorials/mvdr_tutorial.py @@ -37,6 +37,10 @@ print(torchaudio.__version__) +import matplotlib.pyplot as plt +import mir_eval +from IPython.display import Audio + ###################################################################### # 2. Preparation # -------------- @@ -59,10 +63,6 @@ from pesq import pesq from pystoi import stoi -import mir_eval - -import matplotlib.pyplot as plt -from IPython.display import Audio from torchaudio.utils import download_asset ###################################################################### diff --git a/examples/tutorials/online_asr_tutorial.py b/examples/tutorials/online_asr_tutorial.py index 45c65b41c9..51fa292389 100644 --- a/examples/tutorials/online_asr_tutorial.py +++ b/examples/tutorials/online_asr_tutorial.py @@ -13,11 +13,11 @@ # # .. note:: # -# This tutorial requires FFmpeg libraries (>=5, <6) and SentencePiece. +# This tutorial requires FFmpeg libraries (>=4.1, <4.4) and SentencePiece. # # There are multiple ways to install FFmpeg libraries. # If you are using Anaconda Python distribution, -# ``conda install -c conda-forge 'ffmpeg<6'`` will install +# ``conda install 'ffmpeg<4.4'`` will install # the required FFmpeg libraries. # # You can install SentencePiece by running ``pip install sentencepiece``. 
@@ -45,30 +45,9 @@ print(torch.__version__) print(torchaudio.__version__) -###################################################################### -# import IPython import matplotlib.pyplot as plt - -try: - from torchaudio.io import StreamReader -except ModuleNotFoundError: - try: - import google.colab - - print( - """ - To enable running this notebook in Google Colab, install the requisite - third party libraries by running the following code block: - - !add-apt-repository -y ppa:savoury1/ffmpeg4 - !apt-get -qq install -y ffmpeg - """ - ) - except ModuleNotFoundError: - pass - raise - +from torchaudio.io import StreamReader ###################################################################### # 3. Construct the pipeline @@ -202,11 +181,11 @@ def _plot(feats, num_iter, unit=25): fig, axes = plt.subplots(num_plots, 1) t0 = 0 for i, ax in enumerate(axes): - feats_ = feats[i*unit:(i+1)*unit] + feats_ = feats[i * unit : (i + 1) * unit] t1 = t0 + segment_length / sample_rate * len(feats_) feats_ = torch.cat([f[2:-2] for f in feats_]) # remove boundary effect and overlap ax.imshow(feats_.T, extent=[t0, t1, 0, 1], aspect="auto", origin="lower") - ax.tick_params(which='both', left=False, labelleft=False) + ax.tick_params(which="both", left=False, labelleft=False) ax.set_xlim(t0, t0 + unit_dur) t0 = t1 fig.suptitle("MelSpectrogram Feature") @@ -222,9 +201,9 @@ def run_inference(num_iter=100): segment = cacher(chunk[:, 0]) features, length = feature_extractor(segment) hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis) - hypothesis = hypos[0] - transcript = token_processor(hypothesis[0], lstrip=False) - print(transcript, end="", flush=True) + hypothesis = hypos + transcript = token_processor(hypos[0][0], lstrip=False) + print(transcript, end="\r", flush=True) chunks.append(chunk) feats.append(features) diff --git a/examples/tutorials/oscillator_tutorial.py b/examples/tutorials/oscillator_tutorial.py index 25f9a6e7c2..b47c7cff4e 100644 --- a/examples/tutorials/oscillator_tutorial.py +++ b/examples/tutorials/oscillator_tutorial.py @@ -28,19 +28,18 @@ # try: - from torchaudio.prototype.functional import ( - oscillator_bank, - adsr_envelope, - ) + from torchaudio.prototype.functional import adsr_envelope, oscillator_bank except ModuleNotFoundError: print( "Failed to import prototype DSP features. " "Please install torchaudio nightly builds. " "Please refer to https://pytorch.org/get-started/locally " - "for instructions to install a nightly build.") + "for instructions to install a nightly build." + ) raise import math + import matplotlib.pyplot as plt from IPython.display import Audio @@ -93,7 +92,7 @@ # the rest of the tutorial. # -F0 = 344. 
# fundamental frequency +F0 = 344.0 # fundamental frequency DURATION = 1.1 # [seconds] SAMPLE_RATE = 16_000 # [Hz] @@ -102,26 +101,19 @@ ###################################################################### # + def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.3): t = torch.arange(waveform.size(0)) / sample_rate fig, axes = plt.subplots(4, 1, sharex=True) axes[0].plot(t, freq) - axes[0].set( - title=f"Oscillator bank (bank size: {amp.size(-1)})", - ylabel="Frequency [Hz]", - ylim=[-0.03, None]) + axes[0].set(title=f"Oscillator bank (bank size: {amp.size(-1)})", ylabel="Frequency [Hz]", ylim=[-0.03, None]) axes[1].plot(t, amp) - axes[1].set( - ylabel="Amplitude", - ylim=[-0.03 if torch.all(amp >= 0.0) else None, None]) + axes[1].set(ylabel="Amplitude", ylim=[-0.03 if torch.all(amp >= 0.0) else None, None]) axes[2].plot(t, waveform) axes[2].set(ylabel="Waveform") axes[3].specgram(waveform, Fs=sample_rate) - axes[3].set( - ylabel="Spectrogram", - xlabel="Time [s]", - xlim=[-0.01, t[-1] + 0.01]) + axes[3].set(ylabel="Spectrogram", xlabel="Time [s]", xlim=[-0.01, t[-1] + 0.01]) for i in range(4): axes[i].grid(True) @@ -147,7 +139,7 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.3): waveform = oscillator_bank(freq, amp, sample_rate=SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### # Combining multiple sine waves @@ -166,7 +158,7 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.3): waveform = oscillator_bank(freq, amp, sample_rate=SAMPLE_RATE) -show(freq, amp, waveform, SAMPLE_RATE, zoom=(1/F0, 3/F0)) +show(freq, amp, waveform, SAMPLE_RATE, zoom=(1 / F0, 3 / F0)) ###################################################################### @@ -279,7 +271,8 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.3): adsr_envelope(unit, attack=0.01, hold=0.125, decay=0.12, sustain=0.05, release=0), adsr_envelope(unit, attack=0.01, hold=0.25, decay=0.08, sustain=0, release=0), ), - dim=-1) + dim=-1, +) amp = amp.repeat(repeat, 1) / 2 bass = oscillator_bank(freq, amp, sample_rate=SAMPLE_RATE) @@ -316,7 +309,7 @@ def show(freq, amp, waveform, sample_rate, zoom=None, vol=0.3): # ~~~~~ # -env = adsr_envelope(NUM_FRAMES * 6, attack=0.98, decay=0., sustain=1, release=0.02) +env = adsr_envelope(NUM_FRAMES * 6, attack=0.98, decay=0.0, sustain=1, release=0.02) tones = [ 484.90, # B4 diff --git a/examples/tutorials/squim_tutorial.py b/examples/tutorials/squim_tutorial.py index 66d8c6e19d..5314915554 100644 --- a/examples/tutorials/squim_tutorial.py +++ b/examples/tutorials/squim_tutorial.py @@ -78,12 +78,11 @@ # try: - from torchaudio.prototype.pipelines import SQUIM_OBJECTIVE - from torchaudio.prototype.pipelines import SQUIM_SUBJECTIVE from pesq import pesq from pystoi import stoi + from torchaudio.prototype.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE except ImportError: - import google.colab + import google.colab # noqa: F401 print( """ @@ -98,14 +97,15 @@ ) +import matplotlib.pyplot as plt + ###################################################################### # # import torchaudio.functional as F -from torchaudio.utils import download_asset from IPython.display import Audio -import matplotlib.pyplot as plt +from torchaudio.utils import download_asset def si_snr(estimate, reference, epsilon=1e-8): diff --git a/examples/tutorials/streamreader_advanced_tutorial.py 
b/examples/tutorials/streamreader_advanced_tutorial.py index f1798897ef..a6f1673f96 100644 --- a/examples/tutorials/streamreader_advanced_tutorial.py +++ b/examples/tutorials/streamreader_advanced_tutorial.py @@ -20,35 +20,15 @@ print(torch.__version__) print(torchaudio.__version__) -###################################################################### -# - -try: - from torchaudio.io import StreamReader -except ModuleNotFoundError: - try: - import google.colab - - print( - """ - To enable running this notebook in Google Colab, install the requisite - third party libraries by running the following code: - - !add-apt-repository -y ppa:savoury1/ffmpeg4 - !apt-get -qq install -y ffmpeg - """ - ) - except ModuleNotFoundError: - pass - raise - import IPython import matplotlib.pyplot as plt +from torchaudio.io import StreamReader base_url = "https://download.pytorch.org/torchaudio/tutorial-assets" AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" VIDEO_URL = f"{base_url}/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4.mp4" + ###################################################################### # Audio / Video device input # -------------------------- diff --git a/examples/tutorials/streamreader_basic_tutorial.py b/examples/tutorials/streamreader_basic_tutorial.py index a1f80cf372..b18cefbef8 100644 --- a/examples/tutorials/streamreader_basic_tutorial.py +++ b/examples/tutorials/streamreader_basic_tutorial.py @@ -14,11 +14,11 @@ # # .. note:: # -# This tutorial requires FFmpeg libraries (>=5.0, <6). +# This tutorial requires FFmpeg libraries (>=4.1, <4.4). # # There are multiple ways to install FFmpeg libraries. # If you are using Anaconda Python distribution, -# ``conda install -c conda-forge 'ffmpeg<6'`` will install +# ``conda install -c anaconda 'ffmpeg<4.4'`` will install # the required libraries. # @@ -65,29 +65,8 @@ print(torch.__version__) print(torchaudio.__version__) -###################################################################### -# - -try: - from torchaudio.io import StreamReader -except ModuleNotFoundError: - try: - import google.colab - - print( - """ - To enable running this notebook in Google Colab, install the requisite - third party libraries by running the following code: - - !add-apt-repository -y ppa:savoury1/ffmpeg4 - !apt-get -qq install -y ffmpeg - """ - ) - except ModuleNotFoundError: - pass - raise - import matplotlib.pyplot as plt +from torchaudio.io import StreamReader base_url = "https://download.pytorch.org/torchaudio/tutorial-assets" AUDIO_URL = f"{base_url}/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" diff --git a/examples/tutorials/streamwriter_advanced.py b/examples/tutorials/streamwriter_advanced.py index 69bb76eb63..5c306820bc 100644 --- a/examples/tutorials/streamwriter_advanced.py +++ b/examples/tutorials/streamwriter_advanced.py @@ -23,14 +23,17 @@ # # .. note:: # -# This tutorial requires FFmpeg libraries (>=5.0, <6). +# This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4). +# +# To install torchaudio nightly build, please refer to +# https://pytorch.org/get-started/locally/ . +# # # There are multiple ways to install FFmpeg libraries. # If you are using Anaconda Python distribution, -# ``conda install -c conda-forge 'ffmpeg<6'`` will install -# the required libraries. -# This distribution, however, does not have SDL plugin, so -# it cannot play video. 
+# ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries, +# however, this distribution does not have SDL plugin, so it cannot play +# video. # ###################################################################### @@ -71,7 +74,9 @@ from torchaudio.utils import download_asset AUDIO_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") -VIDEO_PATH = download_asset("tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4") +VIDEO_PATH = download_asset( + "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" +) ###################################################################### # @@ -137,7 +142,7 @@ # Write audio to the device with s.open(): for i in range(0, num_frames, 256): - s.write_audio_chunk(0, waveform[i:i+256]) + s.write_audio_chunk(0, waveform[i : i + 256]) ###################################################################### # @@ -183,8 +188,12 @@ # a background thread and give chunks running = True + + def video_streamer(path, frames_per_chunk): - import queue, threading + import queue + import threading + from torchaudio.io import StreamReader q = queue.Queue() @@ -193,9 +202,9 @@ def video_streamer(path, frames_per_chunk): def _streamer(): streamer = StreamReader(path) streamer.add_basic_video_stream( - frames_per_chunk, format="rgb24", - frame_rate=frame_rate, width=width, height=height) - for (chunk_, ) in streamer.stream(): + frames_per_chunk, format="rgb24", frame_rate=frame_rate, width=width, height=height + ) + for (chunk_,) in streamer.stream(): q.put(chunk_) if not running: break diff --git a/examples/tutorials/streamwriter_basic_tutorial.py b/examples/tutorials/streamwriter_basic_tutorial.py index 99f89d1a7b..ed93655990 100644 --- a/examples/tutorials/streamwriter_basic_tutorial.py +++ b/examples/tutorials/streamwriter_basic_tutorial.py @@ -13,12 +13,14 @@ # # .. note:: # -# This tutorial requires FFmpeg libraries (>=5.0, <6). +# This tutorial requires torchaudio nightly build and FFmpeg libraries (>=4.1, <4.4). +# +# To install torchaudio nightly build, please refer to +# https://pytorch.org/get-started/locally/ . # # There are multiple ways to install FFmpeg libraries. # If you are using Anaconda Python distribution, -# ``conda install -c conda-forge 'ffmpeg<6'`` will install -# the required libraries. +# ``conda install 'ffmpeg<4.4'`` will install the required FFmpeg libraries. 
# ###################################################################### @@ -49,27 +51,7 @@ print(torch.__version__) print(torchaudio.__version__) -###################################################################### -# - -try: - from torchaudio.io import StreamWriter -except ImportError: - try: - import google.colab - - print( - """ - To enable running this notebook in Google Colab, install nightly - torch and torchaudio builds by adding the following code block to the top - of the notebook before running it: - !pip3 uninstall -y torch torchvision torchaudio - !pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu - """ - ) - except ModuleNotFoundError: - pass - raise +from torchaudio.io import StreamWriter print("FFmpeg library versions") for k, v in torchaudio.utils.ffmpeg_utils.get_versions().items(): @@ -82,9 +64,10 @@ import os import tempfile -from torchaudio.utils import download_asset from IPython.display import Audio, Video +from torchaudio.utils import download_asset + SAMPLE_PATH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav") WAVEFORM, SAMPLE_RATE = torchaudio.load(SAMPLE_PATH, channels_first=False) NUM_FRAMES, NUM_CHANNELS = WAVEFORM.shape @@ -501,6 +484,8 @@ def get_path(filename): assert bytes1 == bytes2 +import matplotlib.pyplot as plt + ###################################################################### # # Example - Spectrum Visualizer @@ -515,7 +500,6 @@ def get_path(filename): # then use StreamWriter to convert them to video with the original audio. import torchaudio.transforms as T -import matplotlib.pyplot as plt ###################################################################### # @@ -546,7 +530,7 @@ def get_path(filename): # spec_db = T.AmplitudeToDB(stype="magnitude", top_db=80)(specs.T) -_ = plt.imshow(spec_db, aspect="auto", origin='lower') +_ = plt.imshow(spec_db, aspect="auto", origin="lower") ###################################################################### # @@ -567,21 +551,27 @@ def get_path(filename): def _plot(data): ax.clear() x = list(range(len(data))) - R, G, B = 238/255, 76/255, 44/255 + R, G, B = 238 / 255, 76 / 255, 44 / 255 for coeff, alpha in [(0.8, 0.7), (1, 1)]: - d = data ** coeff + d = data**coeff ax.fill_between(x, d, -d, color=[R, G, B, alpha]) xlim = n_fft // 2 + 1 ax.set_xlim([-1, n_fft // 2 + 1]) ax.set_ylim([-1, 1]) ax.text( - xlim, 0.95, + xlim, + 0.95, f"Created with TorchAudio\n{torchaudio.__version__}", - color="white", ha="right", va="top", backgroundcolor="black") + color="white", + ha="right", + va="top", + backgroundcolor="black", + ) fig.canvas.draw() frame = torch.frombuffer(fig.canvas.tostring_rgb(), dtype=torch.uint8) return frame.reshape(nrows, ncols, 3).permute(2, 0, 1) + # sphinx_gallery_defer_figures ###################################################################### @@ -602,10 +592,10 @@ def _plot(data): # Process by second for t in range(0, NUM_FRAMES, SAMPLE_RATE): # Write audio chunk - s.write_audio_chunk(0, WAVEFORM[t:t + SAMPLE_RATE, :]) + s.write_audio_chunk(0, WAVEFORM[t : t + SAMPLE_RATE, :]) # write 1 second of video chunk - frames = [_plot(spec) for spec in specs[i:i+frame_rate]] + frames = [_plot(spec) for spec in specs[i : i + frame_rate]] if frames: s.write_video_chunk(1, torch.stack(frames)) i += frame_rate diff --git a/examples/tutorials/subtractive_synthesis_tutorial.py b/examples/tutorials/subtractive_synthesis_tutorial.py index 3d362cd29d..6af24a0dc9 100644 --- 
a/examples/tutorials/subtractive_synthesis_tutorial.py +++ b/examples/tutorials/subtractive_synthesis_tutorial.py @@ -33,17 +33,14 @@ # try: - from torchaudio.prototype.functional import ( - sinc_impulse_response, - frequency_impulse_response, - filter_waveform, - ) + from torchaudio.prototype.functional import filter_waveform, frequency_impulse_response, sinc_impulse_response except ModuleNotFoundError: print( "Failed to import prototype DSP features. " "Please install torchaudio nightly builds. " "Please refer to https://pytorch.org/get-started/locally " - "for instructions to install a nightly build.") + "for instructions to install a nightly build." + ) raise import matplotlib.pyplot as plt @@ -67,7 +64,7 @@ duration = 4 num_frames = int(duration * SAMPLE_RATE) -noise = torch.rand((num_frames, )) - 0.5 +noise = torch.rand((num_frames,)) - 0.5 ###################################################################### @@ -80,6 +77,7 @@ def plot_input(): axes[1].specgram(noise, Fs=SAMPLE_RATE) Audio(noise, rate=SAMPLE_RATE) + plot_input() ###################################################################### @@ -101,7 +99,7 @@ def plot_input(): window_size = 2049 f_cutoff = torch.linspace(0.0, 0.8, num_filters) -kernel = sinc_impulse_response(f_cutoff , window_size) +kernel = sinc_impulse_response(f_cutoff, window_size) ###################################################################### # @@ -116,6 +114,7 @@ def plot_input(): # Let's look at the spectrogram of the resulting audio and listen to it. # + def plot_sinc_ir(waveform, cutoff, sample_rate, vol=0.2): num_frames = waveform.size(0) duration = num_frames / sample_rate @@ -160,7 +159,7 @@ def plot_sinc_ir(waveform, cutoff, sample_rate, vol=0.2): ###################################################################### # -kernel = sinc_impulse_response(f_cutoff , window_size) +kernel = sinc_impulse_response(f_cutoff, window_size) filtered = filter_waveform(noise, kernel) ###################################################################### @@ -182,7 +181,7 @@ def plot_sinc_ir(waveform, cutoff, sample_rate, vol=0.2): ###################################################################### # -kernel = sinc_impulse_response(f_cutoff , window_size) +kernel = sinc_impulse_response(f_cutoff, window_size) filtered = filter_waveform(noise, kernel) ###################################################################### @@ -200,13 +199,14 @@ def plot_sinc_ir(waveform, cutoff, sample_rate, vol=0.2): # -magnitudes = torch.sin(torch.linspace(0, 10, 64))**4.0 +magnitudes = torch.sin(torch.linspace(0, 10, 64)) ** 4.0 kernel = frequency_impulse_response(magnitudes) filtered = filter_waveform(noise, kernel.unsqueeze(0)) ###################################################################### # + def plot_waveform(magnitudes, filtered, sample_rate): nyquist = sample_rate / 2 num_samples = filtered.size(-1) @@ -218,8 +218,10 @@ def plot_waveform(magnitudes, filtered, sample_rate): offsets = duration * interval # Select N magnitudes for overlays mags = torch.stack( - [magnitudes for _ in range(N)] if magnitudes.ndim == 1 else - [magnitudes[int(i * magnitudes.size(0))] for i in interval]) + [magnitudes for _ in range(N)] + if magnitudes.ndim == 1 + else [magnitudes[int(i * magnitudes.size(0))] for i in interval] + ) mag_x = offsets.unsqueeze(-1) + 0.1 * mags mag_y = torch.linspace(0, nyquist, magnitudes.size(-1)).tile((N, 1)) @@ -229,6 +231,7 @@ def plot_waveform(magnitudes, filtered, sample_rate): ax.specgram(filtered, Fs=sample_rate) return Audio(filtered, 
rate=sample_rate) + ###################################################################### # plot_waveform(magnitudes, filtered, SAMPLE_RATE) @@ -237,8 +240,7 @@ def plot_waveform(magnitudes, filtered, sample_rate): # # It is also possible to make a non-stationary filter. -magnitudes = torch.stack( - [torch.linspace(0.0, w, 1000) for w in torch.linspace(4.0, 40.0, 250)]) +magnitudes = torch.stack([torch.linspace(0.0, w, 1000) for w in torch.linspace(4.0, 40.0, 250)]) magnitudes = torch.sin(magnitudes) ** 4.0 ###################################################################### diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh deleted file mode 100755 index b3792b3573..0000000000 --- a/packaging/build_conda.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -ex - -echo FFMPEG_ROOT=${FFMPEG_ROOT} - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE="conda" -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_constraint -setup_visual_studio_constraint - -export CUDATOOLKIT_CHANNEL="nvidia" -# NOTE: There are some dependencies that are not available for macOS on Python 3.10 without conda-forge -if [[ ${OSTYPE} =~ darwin* ]] && [[ ${PYTHON_VERSION} = "3.10" ]]; then - CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c conda-forge" -fi - -conda build -c defaults -c $CUDATOOLKIT_CHANNEL ${CONDA_CHANNEL_FLAGS:-} --no-anaconda-upload --no-test --python "$PYTHON_VERSION" packaging/torchaudio diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh deleted file mode 100755 index a19fd9cdcd..0000000000 --- a/packaging/build_wheel.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -set -ex - -echo FFMPEG_ROOT=${FFMPEG_ROOT} - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE="wheel" -setup_env -setup_wheel_python -pip_install numpy future cmake ninja -setup_pip_pytorch_version -python setup.py clean -if [[ "$OSTYPE" == "msys" ]]; then - python_tag="$(echo "cp$PYTHON_VERSION" | tr -d '.')" - "$script_dir/vc_env_helper.bat" python setup.py bdist_wheel --plat-name win_amd64 --python-tag $python_tag -else - python setup.py bdist_wheel -fi diff --git a/packaging/ffmpeg/build.sh b/packaging/ffmpeg/build.sh index 233bd55d2b..7549e60757 100755 --- a/packaging/ffmpeg/build.sh +++ b/packaging/ffmpeg/build.sh @@ -20,6 +20,9 @@ args="" if [[ "$OSTYPE" == "msys" ]]; then args="--toolchain=msvc" fi +ffmpeg_version="${FFMPEG_VERSION:-4.1.8}" + +archive="https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n${ffmpeg_version}.tar.gz" build_dir=$(mktemp -d -t ffmpeg-build.XXXXXXXXXX) cleanup() { @@ -32,7 +35,7 @@ cd "${build_dir}" # NOTE: # When changing the version of FFmpeg, update the README so that the link to the source points # the same version. -curl -LsS -o ffmpeg.tar.gz https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n5.0.3.tar.gz +curl -LsS -o ffmpeg.tar.gz "${archive}" tar -xf ffmpeg.tar.gz --strip-components 1 ./configure \ --prefix="${prefix}" \ @@ -72,11 +75,29 @@ ls ${prefix}/* # macOS: Fix rpath so that the libraries are searched dynamically in user environment. # In Linux, this is handled by `--enable-rpath` flag. 
if [[ "$(uname)" == Darwin ]]; then - avcodec=libavcodec.59 - avdevice=libavdevice.59 - avfilter=libavfilter.8 - avformat=libavformat.59 - avutil=libavutil.57 + major_ver=${ffmpeg_version:0:1} + if [[ ${major_ver} == 4 ]]; then + avutil=libavutil.56 + avcodec=libavcodec.58 + avformat=libavformat.58 + avdevice=libavdevice.58 + avfilter=libavfilter.7 + elif [[ ${major_ver} == 5 ]]; then + avutil=libavutil.57 + avcodec=libavcodec.59 + avformat=libavformat.59 + avdevice=libavdevice.59 + avfilter=libavfilter.8 + elif [[ ${major_ver} == 6 ]]; then + avutil=libavutil.58 + avcodec=libavcodec.60 + avformat=libavformat.60 + avdevice=libavdevice.60 + avfilter=libavfilter.9 + else + printf "Error: unexpected FFmpeg major version: %s\n" ${major_ver} + exit 1; + fi otool="/usr/bin/otool" # NOTE: miniconda has a version of otool and install_name_tool installed and we want diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash deleted file mode 100644 index 150305d376..0000000000 --- a/packaging/pkg_helpers.bash +++ /dev/null @@ -1,285 +0,0 @@ -# A set of useful bash functions for common functionality we need to do in -# many build scripts - - -# Setup CUDA environment variables, based on CU_VERSION -# -# Inputs: -# CU_VERSION (cpu, cu92, cu100) -# NO_CUDA_PACKAGE (bool) -# BUILD_TYPE (conda, wheel) -# -# Outputs: -# VERSION_SUFFIX (e.g., "") -# PYTORCH_VERSION_SUFFIX (e.g., +cpu) -# WHEEL_DIR (e.g., cu100/) -# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# USE_CUDA (respected by torchaudio setup.py) -# NVCC_FLAGS (respected by torchaudio setup.py) -# -# Precondition: CUDA versions are installed in their conventional locations in -# /usr/local/cuda-* -# -# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building -# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX == -# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a -# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always -# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU -# version of a Python package. But that doesn't apply if you're on OS X, -# since the default CU_VERSION on OS X is cpu. -setup_cuda() { - - # First, compute version suffixes. 
By default, assume no version suffixes - export VERSION_SUFFIX="" - export PYTORCH_VERSION_SUFFIX="" - export WHEEL_DIR="cpu/" - # Wheel builds need suffixes (but not if they're on OS X, which never has suffix) - if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then - export PYTORCH_VERSION_SUFFIX="+$CU_VERSION" - # Match the suffix scheme of pytorch, unless this package does not have - # CUDA builds (in which case, use default) - if [[ -z "$NO_CUDA_PACKAGE" ]]; then - export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX" - export WHEEL_DIR="$CU_VERSION/" - fi - fi - - # Now work out the CUDA settings - case "$CU_VERSION" in - cu121) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.1" - else - export CUDA_HOME=/usr/local/cuda-12.1/ - fi - export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" - ;; - cu118) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8" - else - export CUDA_HOME=/usr/local/cuda-11.8/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" - ;; - cu117) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7" - else - export CUDA_HOME=/usr/local/cuda-11.7/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu116) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6" - else - export CUDA_HOME=/usr/local/cuda-11.6/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - rocm*) - export USE_ROCM=1 - ;; - cpu) - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - if [[ -n "$CUDA_HOME" ]]; then - if [[ "$OSTYPE" == "msys" ]]; then - export PATH="$CUDA_HOME\\bin:$PATH" - else - # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one - export PATH="$CUDA_HOME/bin:$PATH" - fi - export USE_CUDA=1 - fi -} - -# Populate build version if necessary, and add version suffix -# -# Inputs: -# BUILD_VERSION (e.g., 0.2.0 or empty) -# VERSION_SUFFIX (e.g., +cpu) -# -# Outputs: -# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu) -# -# Fill BUILD_VERSION if it doesn't exist already with a nightly string -# Or retrieve it from the version.txt -# Usage: setup_build_version -setup_build_version() { - if [[ -z "$BUILD_VERSION" ]]; then - if [[ -z "$1" ]]; then - setup_base_build_version - else - BUILD_VERSION="$1" - fi - BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX" - else - BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX" - fi - - # Set build version based on tag if on tag - if [[ -n "${CIRCLE_TAG}" ]]; then - # Strip tag - BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" - fi - - export BUILD_VERSION -} - -setup_base_build_version() { - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # version.txt for some reason has `a` character after major.minor.rev - # command below yields 0.10.0 from version.txt containing 0.10.0a0 - BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" ) - export BUILD_VERSION -} - -# Set some useful variables for OS X, if applicable -setup_macos() { - if [[ "$(uname)" == Darwin ]]; then - export CC=clang CXX=clang++ - fi -} - -# Top-level entry point for things every package will need to do -# -# Usage: setup_env 0.2.0 -setup_env() { - # 
https://github.com/actions/checkout/issues/760#issuecomment-1097501613 - git config --global --add safe.directory /__w/audio/audio - git submodule update --init --recursive - setup_cuda - setup_build_version - setup_macos -} - -# Function to retry functions that sometimes timeout or have flaky failures -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - -# Inputs: -# PYTHON_VERSION (3.8, 3.9, 3.10) -# UNICODE_ABI (bool) -# -# Outputs: -# PATH modified to put correct Python version in PATH -# -# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image -setup_wheel_python() { - if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - eval "$(conda shell.bash hook)" - conda env remove -n "env$PYTHON_VERSION" || true - conda create -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" - conda activate "env$PYTHON_VERSION" - conda install --quiet -y pkg-config - else - case "$PYTHON_VERSION" in - 3.8) python_abi=cp38-cp38 ;; - 3.9) python_abi=cp39-cp39 ;; - 3.10) python_abi=cp310-cp310 ;; - *) - echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - ;; - esac - export PATH="/opt/python/$python_abi/bin:$PATH" - fi -} - -# Install with pip a bit more robustly than the default -pip_install() { - retry pip install --progress-bar off "$@" -} - -# Install torch with pip, respecting PYTORCH_VERSION, and record the installed -# version into PYTORCH_VERSION, if applicable -setup_pip_pytorch_version() { - if [[ -z "$PYTORCH_VERSION" ]]; then - # Install latest prerelease version of torch, per our nightlies, consistent - # with the requested cuda version - pip_install --pre torch -f "https://download.pytorch.org/whl/nightly/${WHEEL_DIR}torch_nightly.html" - # CUDA and CPU are ABI compatible on the CPU-only parts, so strip in this case - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')" - else - pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f https://download.pytorch.org/whl/torch_stable.html \ - -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/torch_${UPLOAD_CHANNEL}.html" - fi -} - -# Fill PYTORCH_VERSION with the latest conda nightly version, and -# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions -# -# You MUST have populated PYTORCH_VERSION_SUFFIX before hand. 
-setup_conda_pytorch_constraint() { - CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS}" - if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-nightly" - if [[ "$OSTYPE" == "msys" ]]; then - export PYTORCH_VERSION="$(conda search --json -c pytorch-nightly pytorch | python -c "import sys, json; data=json.load(sys.stdin); print(data['pytorch'][-1]['version'])")" - else - export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | python3 -c "import sys, json, re; print(re.sub(r'\\+.*$', '', json.load(sys.stdin)['pytorch'][-1]['version']))")" - fi - else - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-test -c pytorch-nightly" - fi - if [[ "$CU_VERSION" == cpu ]]; then - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" - else - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - fi - # TODO: Remove me later, see https://github.com/pytorch/pytorch/issues/62424 for more details - if [[ "$(uname)" == Darwin ]]; then - arch_name="$(uname -m)" - fi -} - -# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT -setup_conda_cudatoolkit_constraint() { - export CONDA_BUILD_VARIANT="cuda" - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - else - case "$CU_VERSION" in - cu121) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=12.1 # [not osx]" - ;; - cu118) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.8 # [not osx]" - ;; - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -# Build the proper compiler package before building the final package -setup_visual_studio_constraint() { - if [[ "$OSTYPE" == "msys" ]]; then - export VSTOOLCHAIN_PACKAGE=vs2019 - export VSDEVCMD_ARGS='' - conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchaudio/conda_build_config.yaml - fi -} diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat index 37097fae0b..6c86834f23 100644 --- a/packaging/windows/internal/cuda_install.bat +++ b/packaging/windows/internal/cuda_install.bat @@ -39,7 +39,7 @@ if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8" ) -set CUDNN_FOLDER=cudnn-windows-x86_64-8.5.0.96_cuda11-archive +set CUDNN_FOLDER=cudnn-windows-x86_64-8.7.0.84_cuda11-archive set CUDNN_LIB_FOLDER="lib" set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip" if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( diff --git a/setup.py b/setup.py index f55a2c3f01..0a8080d06e 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,8 @@ def _fetch_archives(src): def _fetch_third_party_libraries(): - _init_submodule() + 
# Revert this when a submodule is added again + # _init_submodule() if os.name != "nt": _fetch_archives(_parse_sources()) @@ -154,7 +155,10 @@ def _main(): long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/pytorch/audio", - author="Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang", + author=( + "Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, " + "Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang" + ), author_email="soumith@pytorch.org", maintainer="Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang", maintainer_email="moto@meta.com", diff --git a/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py b/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py index bb455ee54a..874c2409d2 100644 --- a/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py +++ b/test/torchaudio_unittest/assets/wav2vec2/fairseq/generate_hubert_model_config.py @@ -76,8 +76,9 @@ def _main(): conf = cfg["model"] del conf["w2v_path"] keep = ["_name", "task", "model"] - for key in list(k for k in conf["w2v_args"] if k not in keep): - del conf["w2v_args"][key] + for key in conf["w2v_args"]: + if key not in keep: + del conf["w2v_args"][key] conf["data"] = "/foo/bar/" conf["w2v_args"]["task"]["data"] = "/foo/bar" conf["w2v_args"]["task"]["labels"] = [] diff --git a/test/torchaudio_unittest/backend/dispatcher/ffmpeg/load_test.py b/test/torchaudio_unittest/backend/dispatcher/ffmpeg/load_test.py index 667be0276b..4a5689df4a 100644 --- a/test/torchaudio_unittest/backend/dispatcher/ffmpeg/load_test.py +++ b/test/torchaudio_unittest/backend/dispatcher/ffmpeg/load_test.py @@ -7,7 +7,7 @@ from parameterized import parameterized from torchaudio._backend.utils import get_load_func from torchaudio._internal import module_utils as _mod_utils -from torchaudio.io._compat import _get_encoder +from torchaudio.io._compat import _parse_save_args from torchaudio_unittest.backend.dispatcher.sox.common import name_func from torchaudio_unittest.common_utils import ( @@ -56,11 +56,10 @@ def assert_format( | | 1. Generate given format with Sox | - v 3. Convert to wav with FFmpeg - given format ----------------------> wav - | | - | 2. Load with torchaudio | 4. Load with scipy + + ----------------------------------+ 3. Convert to wav with FFmpeg | | + | 2. Load the given format | 4. Load with scipy + | with torchaudio | v v tensor ----------> x <----------- tensor 5. Compare @@ -72,7 +71,6 @@ def assert_format( By combining i & ii, step 2. and 4. allow for loading reference given format data without using torchaudio """ - path = self.get_temp_path(f"1.original.{format}") ref_path = self.get_temp_path("2.reference.wav") @@ -91,15 +89,15 @@ def assert_format( # 3. Convert to wav with ffmpeg if normalize: - acodec = "pcm_f32le" + encoder = "pcm_f32le" else: encoding_map = { "floating-point": "PCM_F", "signed-integer": "PCM_S", "unsigned-integer": "PCM_U", } - acodec = _get_encoder(data.dtype, "wav", encoding_map.get(encoding), bit_depth) - _convert_audio_file(path, ref_path, acodec=acodec) + _, encoder, _ = _parse_save_args(format, format, encoding_map.get(encoding), bit_depth) + _convert_audio_file(path, ref_path, encoder=encoder) # 4. 
Load wav with scipy data_ref = load_wav(ref_path, normalize=normalize)[0] @@ -277,7 +275,7 @@ def test_opus(self, bitrate, num_channels, compression_level): """`self._load` can load opus file correctly.""" ops_path = get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus") wav_path = self.get_temp_path(f"{bitrate}_{compression_level}_{num_channels}ch.opus.wav") - _convert_audio_file(ops_path, wav_path, acodec="pcm_f32le") + _convert_audio_file(ops_path, wav_path, encoder="pcm_f32le") expected, sample_rate = load_wav(wav_path) found, sr = self._load(ops_path) @@ -301,15 +299,14 @@ def test_sphere(self, sample_rate, num_channels): @parameterized.expand( list( itertools.product( - ["float32", "int32", "int16"], - [8000, 16000], - [1, 2], + ["int16"], + [3, 4, 16], [False, True], ) ), name_func=name_func, ) - def test_amb(self, dtype, sample_rate, num_channels, normalize): + def test_amb(self, dtype, num_channels, normalize, sample_rate=8000): """`self._load` can load amb format correctly.""" bit_depth = sox_utils.get_bit_depth(dtype) encoding = sox_utils.get_encoding(dtype) diff --git a/test/torchaudio_unittest/backend/dispatcher/ffmpeg/save_test.py b/test/torchaudio_unittest/backend/dispatcher/ffmpeg/save_test.py index ef0e56f0e5..98120f2f4f 100644 --- a/test/torchaudio_unittest/backend/dispatcher/ffmpeg/save_test.py +++ b/test/torchaudio_unittest/backend/dispatcher/ffmpeg/save_test.py @@ -8,7 +8,7 @@ import torch from parameterized import parameterized from torchaudio._backend.utils import get_save_func -from torchaudio.io._compat import _get_encoder, _get_encoder_format +from torchaudio.io._compat import _parse_save_args from torchaudio_unittest.backend.dispatcher.sox.common import get_enc_params, name_func from torchaudio_unittest.common_utils import ( @@ -24,12 +24,14 @@ ) -def _convert_audio_file(src_path, dst_path, format=None, acodec=None): - command = ["ffmpeg", "-y", "-i", src_path, "-strict", "-2"] - if format: - command += ["-sample_fmt", format] - if acodec: - command += ["-acodec", acodec] +def _convert_audio_file(src_path, dst_path, muxer=None, encoder=None, sample_fmt=None): + command = ["ffmpeg", "-hide_banner", "-y", "-i", src_path, "-strict", "-2"] + if muxer: + command += ["-f", muxer] + if encoder: + command += ["-acodec", encoder] + if sample_fmt: + command += ["-sample_fmt", sample_fmt] command += [dst_path] print(" ".join(command), file=sys.stderr) subprocess.run(command, check=True) @@ -100,8 +102,10 @@ def assert_save_consistency( # 2.1. Convert the original wav to target format with torchaudio data = load_wav(src_path, normalize=False)[0] if test_mode == "path": - self._save(tgt_path, data, sample_rate, encoding=encoding, bits_per_sample=bits_per_sample) + ext = format + self._save(tgt_path, data, sample_rate, format=format, encoding=encoding, bits_per_sample=bits_per_sample) elif test_mode == "fileobj": + ext = None with open(tgt_path, "bw") as file_: self._save( file_, @@ -113,6 +117,7 @@ def assert_save_consistency( ) elif test_mode == "bytesio": file_ = io.BytesIO() + ext = None self._save( file_, data, @@ -127,16 +132,15 @@ def assert_save_consistency( else: raise ValueError(f"Unexpected test mode: {test_mode}") # 2.2. Convert the target format to wav with ffmpeg - _convert_audio_file(tgt_path, tst_path, acodec="pcm_f32le") + _convert_audio_file(tgt_path, tst_path, encoder="pcm_f32le") # 2.3. Load with SciPy found = load_wav(tst_path, normalize=False)[0] # 3.1. 
Convert the original wav to target format with ffmpeg - acodec = _get_encoder(data.dtype, format, encoding, bits_per_sample) - sample_fmt = _get_encoder_format(format, bits_per_sample) - _convert_audio_file(src_path, sox_path, acodec=acodec, format=sample_fmt) + muxer, encoder, sample_fmt = _parse_save_args(ext, format, encoding, bits_per_sample) + _convert_audio_file(src_path, sox_path, muxer=muxer, encoder=encoder, sample_fmt=sample_fmt) # 3.2. Convert the target format to wav with ffmpeg - _convert_audio_file(sox_path, ref_path, acodec="pcm_f32le") + _convert_audio_file(sox_path, ref_path, encoder="pcm_f32le") # 3.3. Load with SciPy expected = load_wav(ref_path, normalize=False)[0] diff --git a/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py b/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py index 3721b42a26..58c6098a2c 100644 --- a/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py +++ b/test/torchaudio_unittest/backend/dispatcher/sox/load_test.py @@ -2,7 +2,6 @@ from functools import partial import torch -import torchaudio from parameterized import parameterized from torchaudio._backend.utils import get_load_func from torchaudio_unittest.common_utils import ( @@ -316,13 +315,6 @@ def test_sox(self, frame_offset, num_frames, channels_first, normalize): self._test(torch.ops.torchaudio.sox_io_load_audio_file, frame_offset, num_frames, channels_first, normalize) - # test file-like obj - def func(path, *args): - with open(path, "rb") as fileobj: - return torchaudio.lib._torchaudio_sox.load_audio_fileobj(fileobj, *args) - - self._test(func, frame_offset, num_frames, channels_first, normalize) - @nested_params( [0, 1, 10, 100, 1000], [-1, 1, 10, 100, 1000], diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py index 70532f4ba0..eea0313314 100644 --- a/test/torchaudio_unittest/backend/sox_io/info_test.py +++ b/test/torchaudio_unittest/backend/sox_io/info_test.py @@ -1,22 +1,14 @@ -import io import itertools -import os -import tarfile -from contextlib import contextmanager from parameterized import parameterized -from torchaudio._internal import module_utils as _mod_utils from torchaudio.backend import sox_io_backend -from torchaudio.utils.sox_utils import get_buffer_size, set_buffer_size -from torchaudio_unittest.backend.common import get_bits_per_sample, get_encoding +from torchaudio_unittest.backend.common import get_encoding from torchaudio_unittest.common_utils import ( get_asset_path, get_wav_data, - HttpServerMixin, PytorchTestCase, save_wav, skipIfNoExec, - skipIfNoModule, skipIfNoSox, sox_utils, TempDirMixin, @@ -25,10 +17,6 @@ from .common import name_func -if _mod_utils.is_module_available("requests"): - import requests - - @skipIfNoExec("sox") @skipIfNoSox class TestInfo(TempDirMixin, PytorchTestCase): @@ -329,268 +317,6 @@ def test_mp3(self): assert sinfo.bits_per_sample == 0 # bit_per_sample is irrelevant for compressed formats assert sinfo.encoding == "MP3" - with open(path, "rb") as fileobj: - sinfo = sox_io_backend.info(fileobj, format="mp3") - assert sinfo.sample_rate == 16000 - assert sinfo.num_frames == 80000 - assert sinfo.num_channels == 1 - assert sinfo.bits_per_sample == 0 - assert sinfo.encoding == "MP3" - - -class FileObjTestBase(TempDirMixin): - def _gen_file(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None): - path = self.get_temp_path(f"test.{ext}") - bit_depth = sox_utils.get_bit_depth(dtype) - duration = num_frames / sample_rate - 
comment_file = self._gen_comment_file(comments) if comments else None - - sox_utils.gen_audio_file( - path, - sample_rate, - num_channels=num_channels, - encoding=sox_utils.get_encoding(dtype), - bit_depth=bit_depth, - duration=duration, - comment_file=comment_file, - ) - return path - - def _gen_comment_file(self, comments): - comment_path = self.get_temp_path("comment.txt") - with open(comment_path, "w") as file_: - file_.writelines(comments) - return comment_path - - -class Unseekable: - def __init__(self, fileobj): - self.fileobj = fileobj - - def read(self, n): - return self.fileobj.read(n) - - -@skipIfNoSox -@skipIfNoExec("sox") -class TestFileObject(FileObjTestBase, PytorchTestCase): - def _query_fileobj(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None): - path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames, comments=comments) - format_ = ext if ext in ["mp3"] else None - with open(path, "rb") as fileobj: - return sox_io_backend.info(fileobj, format_) - - def _query_bytesio(self, ext, dtype, sample_rate, num_channels, num_frames): - path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames) - format_ = ext if ext in ["mp3"] else None - with open(path, "rb") as file_: - fileobj = io.BytesIO(file_.read()) - return sox_io_backend.info(fileobj, format_) - - def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames): - audio_path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames) - audio_file = os.path.basename(audio_path) - archive_path = self.get_temp_path("archive.tar.gz") - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - format_ = ext if ext in ["mp3"] else None - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - return sox_io_backend.info(fileobj, format_) - - @contextmanager - def _set_buffer_size(self, buffer_size): - try: - original_buffer_size = get_buffer_size() - set_buffer_size(buffer_size) - yield - finally: - set_buffer_size(original_buffer_size) - - @parameterized.expand( - [ - ("wav", "float32"), - ("wav", "int32"), - ("wav", "int16"), - ("wav", "uint8"), - ("mp3", "float32"), - ("flac", "float32"), - ("vorbis", "float32"), - ("amb", "int16"), - ] - ) - def test_fileobj(self, ext, dtype): - """Querying audio via file object works""" - sample_rate = 16000 - num_frames = 3 * sample_rate - num_channels = 2 - sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels, num_frames) - - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = {"vorbis": 0, "mp3": 49536}.get(ext, num_frames) - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - - @parameterized.expand( - [ - ("vorbis", "float32"), - ] - ) - def test_fileobj_large_header(self, ext, dtype): - """ - For audio file with header size exceeding default buffer size: - - Querying audio via file object without enlarging buffer size fails. - - Querying audio via file object after enlarging buffer size succeeds. 
- """ - sample_rate = 16000 - num_frames = 3 * sample_rate - num_channels = 2 - comments = "metadata=" + " ".join(["value" for _ in range(1000)]) - - with self.assertRaises(RuntimeError): - sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels, num_frames, comments=comments) - - with self._set_buffer_size(16384): - sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels, num_frames, comments=comments) - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = 0 if ext in ["vorbis"] else num_frames - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - - @parameterized.expand( - [ - ("wav", "float32"), - ("wav", "int32"), - ("wav", "int16"), - ("wav", "uint8"), - ("mp3", "float32"), - ("flac", "float32"), - ("vorbis", "float32"), - ("amb", "int16"), - ] - ) - def test_bytesio(self, ext, dtype): - """Querying audio via ByteIO object works for small data""" - sample_rate = 16000 - num_frames = 3 * sample_rate - num_channels = 2 - sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames) - - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = {"vorbis": 0, "mp3": 49536}.get(ext, num_frames) - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - - @parameterized.expand( - [ - ("wav", "float32"), - ("wav", "int32"), - ("wav", "int16"), - ("wav", "uint8"), - ("mp3", "float32"), - ("flac", "float32"), - ("vorbis", "float32"), - ("amb", "int16"), - ] - ) - def test_bytesio_tiny(self, ext, dtype): - """Querying audio via ByteIO object works for small data""" - sample_rate = 8000 - num_frames = 4 - num_channels = 2 - sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames) - - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = {"vorbis": 0, "mp3": 1728}.get(ext, num_frames) - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - - @parameterized.expand( - [ - ("wav", "float32"), - ("wav", "int32"), - ("wav", "int16"), - ("wav", "uint8"), - ("mp3", "float32"), - ("flac", "float32"), - ("vorbis", "float32"), - ("amb", "int16"), - ] - ) - def test_tarfile(self, ext, dtype): - """Querying compressed audio via file-like object works""" - sample_rate = 16000 - num_frames = 3.0 * sample_rate - num_channels = 2 - sinfo = self._query_tarfile(ext, dtype, sample_rate, num_channels, num_frames) - - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = {"vorbis": 0, "mp3": 49536}.get(ext, num_frames) - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - - -@skipIfNoSox -@skipIfNoExec("sox") -@skipIfNoModule("requests") -class TestFileObjectHttp(HttpServerMixin, FileObjTestBase, PytorchTestCase): - def _query_http(self, ext, dtype, sample_rate, num_channels, num_frames): - audio_path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames) - audio_file = os.path.basename(audio_path) - - url = 
self.get_url(audio_file) - format_ = ext if ext in ["mp3"] else None - with requests.get(url, stream=True) as resp: - return sox_io_backend.info(Unseekable(resp.raw), format=format_) - - @parameterized.expand( - [ - ("wav", "float32"), - ("wav", "int32"), - ("wav", "int16"), - ("wav", "uint8"), - ("mp3", "float32"), - ("flac", "float32"), - ("vorbis", "float32"), - ("amb", "int16"), - ] - ) - def test_requests(self, ext, dtype): - """Querying compressed audio via requests works""" - sample_rate = 16000 - num_frames = 3.0 * sample_rate - num_channels = 2 - sinfo = self._query_http(ext, dtype, sample_rate, num_channels, num_frames) - - bits_per_sample = get_bits_per_sample(ext, dtype) - num_frames = {"vorbis": 0, "mp3": 49536}.get(ext, num_frames) - - assert sinfo.sample_rate == sample_rate - assert sinfo.num_channels == num_channels - assert sinfo.num_frames == num_frames - assert sinfo.bits_per_sample == bits_per_sample - assert sinfo.encoding == get_encoding(ext, dtype) - @skipIfNoSox class TestInfoNoSuchFile(PytorchTestCase): diff --git a/test/torchaudio_unittest/backend/sox_io/load_test.py b/test/torchaudio_unittest/backend/sox_io/load_test.py index 54cfd7b7ae..ea16803077 100644 --- a/test/torchaudio_unittest/backend/sox_io/load_test.py +++ b/test/torchaudio_unittest/backend/sox_io/load_test.py @@ -1,22 +1,16 @@ -import io import itertools -import tarfile import torch -import torchaudio from parameterized import parameterized -from torchaudio._internal import module_utils as _mod_utils from torchaudio.backend import sox_io_backend from torchaudio_unittest.common_utils import ( get_asset_path, get_wav_data, - HttpServerMixin, load_wav, nested_params, PytorchTestCase, save_wav, skipIfNoExec, - skipIfNoModule, skipIfNoSox, sox_utils, TempDirMixin, @@ -25,10 +19,6 @@ from .common import name_func -if _mod_utils.is_module_available("requests"): - import requests - - class LoadTestBase(TempDirMixin, PytorchTestCase): def assert_format( self, @@ -322,13 +312,6 @@ def test_sox(self, frame_offset, num_frames, channels_first, normalize): self._test(torch.ops.torchaudio.sox_io_load_audio_file, frame_offset, num_frames, channels_first, normalize) - # test file-like obj - def func(path, *args): - with open(path, "rb") as fileobj: - return torchaudio.lib._torchaudio_sox.load_audio_fileobj(fileobj, *args) - - self._test(func, frame_offset, num_frames, channels_first, normalize) - @nested_params( [0, 1, 10, 100, 1000], [-1, 1, 10, 100, 1000], @@ -365,263 +348,6 @@ def test_mp3(self): _, sr = sox_io_backend.load(path) assert sr == 16000 - with open(path, "rb") as fileobj: - _, sr = sox_io_backend.load(fileobj) - assert sr == 16000 - - -class CloggedFileObj: - def __init__(self, fileobj): - self.fileobj = fileobj - - def read(self, _): - return self.fileobj.read(2) - - def seek(self, offset, whence): - return self.fileobj.seek(offset, whence) - - -@skipIfNoSox -@skipIfNoExec("sox") -class TestFileObject(TempDirMixin, PytorchTestCase): - """ - In this test suite, the result of file-like object input is compared against file path input, - because `load` function is rigrously tested for file path inputs to match libsox's result, - """ - - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - 
def test_fileobj(self, ext, kwargs): - """Loading audio via file object returns the same result as via file path.""" - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - path = self.get_temp_path(f"test.{ext}") - - sox_utils.gen_audio_file(path, sample_rate, num_channels=2, **kwargs) - expected, _ = sox_io_backend.load(path) - - with open(path, "rb") as fileobj: - found, sr = sox_io_backend.load(fileobj, format=format_) - - assert sr == sample_rate - self.assertEqual(expected, found) - - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - def test_bytesio(self, ext, kwargs): - """Loading audio via BytesIO object returns the same result as via file path.""" - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - path = self.get_temp_path(f"test.{ext}") - - sox_utils.gen_audio_file(path, sample_rate, num_channels=2, **kwargs) - expected, _ = sox_io_backend.load(path) - - with open(path, "rb") as file_: - fileobj = io.BytesIO(file_.read()) - found, sr = sox_io_backend.load(fileobj, format=format_) - - assert sr == sample_rate - self.assertEqual(expected, found) - - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - def test_bytesio_clogged(self, ext, kwargs): - """Loading audio via clogged file object returns the same result as via file path. - - This test case validates the case where fileobject returns shorter bytes than requeted. 
- """ - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - path = self.get_temp_path(f"test.{ext}") - - sox_utils.gen_audio_file(path, sample_rate, num_channels=2, **kwargs) - expected, _ = sox_io_backend.load(path) - - with open(path, "rb") as file_: - fileobj = CloggedFileObj(io.BytesIO(file_.read())) - found, sr = sox_io_backend.load(fileobj, format=format_) - - assert sr == sample_rate - self.assertEqual(expected, found) - - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - def test_bytesio_tiny(self, ext, kwargs): - """Loading very small audio via file object returns the same result as via file path.""" - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - path = self.get_temp_path(f"test.{ext}") - - sox_utils.gen_audio_file(path, sample_rate, num_channels=2, duration=1 / 1600, **kwargs) - expected, _ = sox_io_backend.load(path) - - with open(path, "rb") as file_: - fileobj = io.BytesIO(file_.read()) - found, sr = sox_io_backend.load(fileobj, format=format_) - - assert sr == sample_rate - self.assertEqual(expected, found) - - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - def test_tarfile(self, ext, kwargs): - """Loading compressed audio via file-like object returns the same result as via file path.""" - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - archive_path = self.get_temp_path("archive.tar.gz") - - sox_utils.gen_audio_file(audio_path, sample_rate, num_channels=2, **kwargs) - expected, _ = sox_io_backend.load(audio_path) - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - found, sr = sox_io_backend.load(fileobj, format=format_) - - assert sr == sample_rate - self.assertEqual(expected, found) - - -class Unseekable: - def __init__(self, fileobj): - self.fileobj = fileobj - - def read(self, n): - return self.fileobj.read(n) - - -@skipIfNoSox -@skipIfNoExec("sox") -@skipIfNoModule("requests") -class TestFileObjectHttp(HttpServerMixin, PytorchTestCase): - @parameterized.expand( - [ - ("wav", {"bit_depth": 16}), - ("wav", {"bit_depth": 24}), - ("wav", {"bit_depth": 32}), - ("mp3", {"compression": 128}), - ("mp3", {"compression": 320}), - ("flac", {"compression": 0}), - ("flac", {"compression": 5}), - ("flac", {"compression": 8}), - ("vorbis", {"compression": -1}), - ("vorbis", {"compression": 10}), - ("amb", {}), - ] - ) - def test_requests(self, ext, kwargs): - sample_rate = 16000 - format_ = ext if ext in ["mp3"] else None - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - - sox_utils.gen_audio_file(audio_path, sample_rate, num_channels=2, **kwargs) - expected, _ = sox_io_backend.load(audio_path) - - url = self.get_url(audio_file) - with requests.get(url, 
stream=True) as resp: - found, sr = sox_io_backend.load(Unseekable(resp.raw), format=format_) - - assert sr == sample_rate - if ext != "mp3": - self.assertEqual(expected, found) - - @parameterized.expand( - list( - itertools.product( - [0, 1, 10, 100, 1000], - [-1, 1, 10, 100, 1000], - ) - ), - name_func=name_func, - ) - def test_frame(self, frame_offset, num_frames): - """num_frames and frame_offset correctly specify the region of data""" - sample_rate = 8000 - audio_file = "test.wav" - audio_path = self.get_temp_path(audio_file) - - original = get_wav_data("float32", num_channels=2) - save_wav(audio_path, original, sample_rate) - frame_end = None if num_frames == -1 else frame_offset + num_frames - expected = original[:, frame_offset:frame_end] - - url = self.get_url(audio_file) - with requests.get(url, stream=True) as resp: - found, sr = sox_io_backend.load(resp.raw, frame_offset, num_frames) - - assert sr == sample_rate - self.assertEqual(expected, found) - @skipIfNoSox class TestLoadNoSuchFile(PytorchTestCase): diff --git a/test/torchaudio_unittest/backend/sox_io/save_test.py b/test/torchaudio_unittest/backend/sox_io/save_test.py index 5db7a5a9f8..a92b3d98cb 100644 --- a/test/torchaudio_unittest/backend/sox_io/save_test.py +++ b/test/torchaudio_unittest/backend/sox_io/save_test.py @@ -1,4 +1,3 @@ -import io import os import torch @@ -43,7 +42,6 @@ def assert_save_consistency( num_channels: int = 2, num_frames: float = 3 * 8000, src_dtype: str = "int32", - test_mode: str = "path", ): """`save` function produces file that is comparable with `sox` command @@ -97,37 +95,9 @@ def assert_save_consistency( # 2.1. Convert the original wav to target format with torchaudio data = load_wav(src_path, normalize=False)[0] - if test_mode == "path": - sox_io_backend.save( - tgt_path, data, sample_rate, compression=compression, encoding=encoding, bits_per_sample=bits_per_sample - ) - elif test_mode == "fileobj": - with open(tgt_path, "bw") as file_: - sox_io_backend.save( - file_, - data, - sample_rate, - format=format, - compression=compression, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - elif test_mode == "bytesio": - file_ = io.BytesIO() - sox_io_backend.save( - file_, - data, - sample_rate, - format=format, - compression=compression, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - file_.seek(0) - with open(tgt_path, "bw") as f: - f.write(file_.read()) - else: - raise ValueError(f"Unexpected test mode: {test_mode}") + sox_io_backend.save( + tgt_path, data, sample_rate, compression=compression, encoding=encoding, bits_per_sample=bits_per_sample + ) # 2.2. Convert the target format to wav with sox sox_utils.convert_audio_file(tgt_path, tst_path, encoding=cmp_encoding, bit_depth=cmp_bit_depth) # 2.3. 
Load with SciPy @@ -150,7 +120,6 @@ def assert_save_consistency( @skipIfNoSox class SaveTest(SaveTestBase): @nested_params( - ["path", "fileobj", "bytesio"], [ ("PCM_U", 8), ("PCM_S", 16), @@ -161,12 +130,11 @@ class SaveTest(SaveTestBase): ("ALAW", 8), ], ) - def test_save_wav(self, test_mode, enc_params): + def test_save_wav(self, enc_params): encoding, bits_per_sample = enc_params - self.assert_save_consistency("wav", encoding=encoding, bits_per_sample=bits_per_sample, test_mode=test_mode) + self.assert_save_consistency("wav", encoding=encoding, bits_per_sample=bits_per_sample) @nested_params( - ["path", "fileobj", "bytesio"], [ ("float32",), ("int32",), @@ -174,12 +142,11 @@ def test_save_wav(self, test_mode, enc_params): ("uint8",), ], ) - def test_save_wav_dtype(self, test_mode, params): + def test_save_wav_dtype(self, params): (dtype,) = params - self.assert_save_consistency("wav", src_dtype=dtype, test_mode=test_mode) + self.assert_save_consistency("wav", src_dtype=dtype) @nested_params( - ["path", "fileobj", "bytesio"], [8, 16, 24], [ None, @@ -194,19 +161,13 @@ def test_save_wav_dtype(self, test_mode, params): 8, ], ) - def test_save_flac(self, test_mode, bits_per_sample, compression_level): - self.assert_save_consistency( - "flac", compression=compression_level, bits_per_sample=bits_per_sample, test_mode=test_mode - ) + def test_save_flac(self, bits_per_sample, compression_level): + self.assert_save_consistency("flac", compression=compression_level, bits_per_sample=bits_per_sample) - @nested_params( - ["path", "fileobj", "bytesio"], - ) - def test_save_htk(self, test_mode): - self.assert_save_consistency("htk", test_mode=test_mode, num_channels=1) + def test_save_htk(self): + self.assert_save_consistency("htk", num_channels=1) @nested_params( - ["path", "fileobj", "bytesio"], [ None, -1, @@ -219,11 +180,10 @@ def test_save_htk(self, test_mode): 10, ], ) - def test_save_vorbis(self, test_mode, quality_level): - self.assert_save_consistency("vorbis", compression=quality_level, test_mode=test_mode) + def test_save_vorbis(self, quality_level): + self.assert_save_consistency("vorbis", compression=quality_level) @nested_params( - ["path", "fileobj", "bytesio"], [ ( "PCM_S", @@ -248,12 +208,11 @@ def test_save_vorbis(self, test_mode, quality_level): ("ALAW", 32), ], ) - def test_save_sphere(self, test_mode, enc_params): + def test_save_sphere(self, enc_params): encoding, bits_per_sample = enc_params - self.assert_save_consistency("sph", encoding=encoding, bits_per_sample=bits_per_sample, test_mode=test_mode) + self.assert_save_consistency("sph", encoding=encoding, bits_per_sample=bits_per_sample) @nested_params( - ["path", "fileobj", "bytesio"], [ ( "PCM_U", @@ -289,12 +248,11 @@ def test_save_sphere(self, test_mode, enc_params): ), ], ) - def test_save_amb(self, test_mode, enc_params): + def test_save_amb(self, enc_params): encoding, bits_per_sample = enc_params - self.assert_save_consistency("amb", encoding=encoding, bits_per_sample=bits_per_sample, test_mode=test_mode) + self.assert_save_consistency("amb", encoding=encoding, bits_per_sample=bits_per_sample) @nested_params( - ["path", "fileobj", "bytesio"], [ None, 0, @@ -307,18 +265,15 @@ def test_save_amb(self, test_mode, enc_params): 7, ], ) - def test_save_amr_nb(self, test_mode, bit_rate): - self.assert_save_consistency("amr-nb", compression=bit_rate, num_channels=1, test_mode=test_mode) + def test_save_amr_nb(self, bit_rate): + self.assert_save_consistency("amr-nb", compression=bit_rate, num_channels=1) - @nested_params( - 
["path", "fileobj", "bytesio"], - ) - def test_save_gsm(self, test_mode): - self.assert_save_consistency("gsm", num_channels=1, test_mode=test_mode) + def test_save_gsm(self): + self.assert_save_consistency("gsm", num_channels=1) with self.assertRaises(RuntimeError, msg="gsm format only supports single channel audio."): - self.assert_save_consistency("gsm", num_channels=2, test_mode=test_mode) + self.assert_save_consistency("gsm", num_channels=2) with self.assertRaises(RuntimeError, msg="gsm format only supports a sampling rate of 8kHz."): - self.assert_save_consistency("gsm", sample_rate=16000, test_mode=test_mode) + self.assert_save_consistency("gsm", sample_rate=16000) @parameterized.expand( [ diff --git a/test/torchaudio_unittest/backend/sox_io/smoke_test.py b/test/torchaudio_unittest/backend/sox_io/smoke_test.py index e394161044..01e4305661 100644 --- a/test/torchaudio_unittest/backend/sox_io/smoke_test.py +++ b/test/torchaudio_unittest/backend/sox_io/smoke_test.py @@ -1,4 +1,3 @@ -import io import itertools from parameterized import parameterized @@ -89,88 +88,3 @@ def test_vorbis(self, sample_rate, num_channels, quality_level): def test_flac(self, sample_rate, num_channels, compression_level): """Run smoke test on flac format""" self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level) - - -@skipIfNoSox -class SmokeTestFileObj(TorchaudioTestCase): - """Run smoke test on various audio format - - The purpose of this test suite is to verify that sox_io_backend functionalities do not exhibit - abnormal behaviors. - - This test suite should be able to run without any additional tools (such as sox command), - however without such tools, the correctness of each function cannot be verified. - """ - - def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype="float32"): - duration = 1 - num_frames = sample_rate * duration - original = get_wav_data(dtype, num_channels, normalize=False, num_frames=num_frames) - - fileobj = io.BytesIO() - # 1. run save - sox_io_backend.save(fileobj, original, sample_rate, compression=compression, format=ext) - # 2. run info - fileobj.seek(0) - info = sox_io_backend.info(fileobj, format=ext) - assert info.sample_rate == sample_rate - assert info.num_channels == num_channels - # 3. 
run load - fileobj.seek(0) - loaded, sr = sox_io_backend.load(fileobj, normalize=False, format=ext) - assert sr == sample_rate - assert loaded.shape[0] == num_channels - - @parameterized.expand( - list( - itertools.product( - ["float32", "int32", "int16", "uint8"], - [8000, 16000], - [1, 2], - ) - ), - name_func=name_func, - ) - def test_wav(self, dtype, sample_rate, num_channels): - """Run smoke test on wav format""" - self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype) - - @parameterized.expand( - list( - itertools.product( - [8000, 16000], - [1, 2], - [-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320], - ) - ) - ) - def test_mp3(self, sample_rate, num_channels, bit_rate): - """Run smoke test on mp3 format""" - self.run_smoke_test("mp3", sample_rate, num_channels, compression=bit_rate) - - @parameterized.expand( - list( - itertools.product( - [8000, 16000], - [1, 2], - [-1, 0, 1, 2, 3, 3.6, 5, 10], - ) - ) - ) - def test_vorbis(self, sample_rate, num_channels, quality_level): - """Run smoke test on vorbis format""" - self.run_smoke_test("vorbis", sample_rate, num_channels, compression=quality_level) - - @parameterized.expand( - list( - itertools.product( - [8000, 16000], - [1, 2], - list(range(9)), - ) - ), - name_func=name_func, - ) - def test_flac(self, sample_rate, num_channels, compression_level): - """Run smoke test on flac format""" - self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level) diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 9067c5f9cd..8c66f2149a 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -1,6 +1,7 @@ from .autograd_utils import use_deterministic_algorithms from .backend_utils import set_audio_backend from .case_utils import ( + disabledInCI, HttpServerMixin, is_ffmpeg_available, PytorchTestCase, @@ -12,7 +13,6 @@ skipIfNoExec, skipIfNoFFmpeg, skipIfNoHWAccel, - skipIfNoKaldi, skipIfNoMacOS, skipIfNoModule, skipIfNoQengine, @@ -51,7 +51,6 @@ "skipIfNoExec", "skipIfNoMacOS", "skipIfNoModule", - "skipIfNoKaldi", "skipIfNoRIR", "skipIfNoSox", "skipIfNoSoxBackend", @@ -60,6 +59,7 @@ "skipIfNoFFmpeg", "skipIfNoHWAccel", "skipIfPy310", + "disabledInCI", "get_wav_data", "normalize_wav", "load_wav", diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py index ea88b45fb8..249ca6d98f 100644 --- a/test/torchaudio_unittest/common_utils/case_utils.py +++ b/test/torchaudio_unittest/common_utils/case_utils.py @@ -234,11 +234,6 @@ def skipIfNoModule(module, display_name=None): reason="Sox features are not available.", key="NO_SOX", ) -skipIfNoKaldi = _skipIf( - not torchaudio._extension._IS_KALDI_AVAILABLE, - reason="Kaldi features are not available.", - key="NO_KALDI", -) skipIfNoRIR = _skipIf( not torchaudio._extension._IS_RIR_AVAILABLE, reason="RIR features are not available.", @@ -287,6 +282,11 @@ def skipIfNoModule(module, display_name=None): reason="This feature is only available for MacOS.", key="NO_MACOS", ) +disabledInCI = _skipIf( + "CI" in os.environ, + reason="Tests are failing on CI consistently. 
Disabled while investigating.", + key="TEMPORARY_DISABLED", +) def skipIfNoHWAccel(name): diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py index eee49e62cb..24553a833c 100644 --- a/test/torchaudio_unittest/functional/batch_consistency_test.py +++ b/test/torchaudio_unittest/functional/batch_consistency_test.py @@ -257,18 +257,6 @@ def test_resample_waveform(self, resampling_method): atol=1e-7, ) - @common_utils.skipIfNoKaldi - def test_compute_kaldi_pitch(self): - sample_rate = 44100 - n_channels = 2 - waveform = common_utils.get_whitenoise(sample_rate=sample_rate, n_channels=self.batch_size * n_channels) - batch = waveform.view(self.batch_size, n_channels, waveform.size(-1)) - kwargs = { - "sample_rate": sample_rate, - } - func = partial(F.compute_kaldi_pitch, **kwargs) - self.assert_batch_consistency(func, inputs=(batch,)) - def test_lfilter(self): signal_length = 2048 x = torch.randn(self.batch_size, signal_length) diff --git a/test/torchaudio_unittest/functional/functional_impl.py b/test/torchaudio_unittest/functional/functional_impl.py index d7847c034f..6bb6a9f8bf 100644 --- a/test/torchaudio_unittest/functional/functional_impl.py +++ b/test/torchaudio_unittest/functional/functional_impl.py @@ -1116,55 +1116,60 @@ def test_preemphasis_deemphasis_roundtrip(self, input_shape, coeff): @parameterized.expand( [ - ([0, 1, 1, 0], [0, 1, 5, 1, 0], torch.int32), - ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], torch.int32), - ([3, 3, 3], [3, 5, 3, 5, 3], torch.int64), - ([0, 1, 2], [0, 1, 1, 1, 2], torch.int64), + ([[0, 1, 1, 0]], [[0, 1, 5, 1, 0]], torch.int32), + ([[0, 1, 2, 3, 4]], [[0, 1, 2, 3, 4]], torch.int32), + ([[3, 3, 3]], [[3, 5, 3, 5, 3]], torch.int64), + ([[0, 1, 2]], [[0, 1, 1, 1, 2]], torch.int64), ] ) def test_forced_align(self, targets, ref_path, targets_dtype): emission = torch.tensor( [ - [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], - [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], - [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], - [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], - [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107], + [ + [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], + [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], + [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], + [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], + [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107], + ] ], dtype=self.dtype, device=self.device, ) blank = 5 + batch_index = 0 ref_path = torch.tensor(ref_path, dtype=targets_dtype, device=self.device) ref_scores = torch.tensor( - [torch.log(emission[i, ref_path[i]]).item() for i in range(emission.shape[0])], + [torch.log(emission[batch_index, i, ref_path[batch_index, i]]).item() for i in range(emission.shape[1])], dtype=emission.dtype, device=self.device, - ) + ).unsqueeze(0) log_probs = torch.log(emission) targets = torch.tensor(targets, dtype=targets_dtype, device=self.device) - input_lengths = torch.tensor((log_probs.shape[0])) - target_lengths = torch.tensor((targets.shape[0])) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) + assert hyp_path.shape == ref_path.shape + assert hyp_scores.shape 
== ref_scores.shape self.assertEqual(hyp_path, ref_path) self.assertEqual(hyp_scores, ref_scores) @parameterized.expand([(torch.int32,), (torch.int64,)]) def test_forced_align_fail(self, targets_dtype): - log_probs = torch.rand(5, 6, dtype=self.dtype, device=self.device) - targets = torch.tensor([0, 1, 2, 3, 4, 4], dtype=targets_dtype, device=self.device) + log_probs = torch.rand(1, 5, 6, dtype=self.dtype, device=self.device) + targets = torch.tensor([[0, 1, 2, 3, 4, 4]], dtype=targets_dtype, device=self.device) blank = 5 - input_lengths = torch.tensor((log_probs.shape[0]), device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) with self.assertRaisesRegex(RuntimeError, r"targets length is too long for CTC"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([5, 3, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[5, 3, 3]], dtype=targets_dtype, device=self.device) with self.assertRaisesRegex(ValueError, r"targets Tensor shouldn't contain blank index"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) log_probs = log_probs.int() - targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device) with self.assertRaisesRegex(RuntimeError, r"log_probs must be float64, float32 or float16"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) @@ -1175,40 +1180,42 @@ def test_forced_align_fail(self, targets_dtype): log_probs = torch.rand(3, 4, 6, dtype=self.dtype, device=self.device) targets = targets.int() - with self.assertRaisesRegex(RuntimeError, r"3-D tensor is not yet supported for log_probs"): + with self.assertRaisesRegex( + RuntimeError, r"The batch dimension for log_probs must be 1 at the current version" + ): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) targets = torch.randint(0, 4, (3, 4), device=self.device) - log_probs = torch.rand(3, 6, dtype=self.dtype, device=self.device) - with self.assertRaisesRegex(RuntimeError, r"2-D tensor is not yet supported for targets"): + log_probs = torch.rand(1, 3, 6, dtype=self.dtype, device=self.device) + with self.assertRaisesRegex(RuntimeError, r"The batch dimension for targets must be 1 at the current version"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device) - input_lengths = torch.randint(1, 5, (3,), device=self.device) - with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 0-D"): + targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device) + input_lengths = torch.randint(1, 5, (3, 5), device=self.device) + with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 1-D"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((log_probs.shape[0]), device=self.device) - target_lengths = torch.randint(1, 5, (3,), device=self.device) - with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 0-D"): + input_lengths = torch.tensor([log_probs.shape[0]], device=self.device) + target_lengths = 
torch.randint(1, 5, (3, 5), device=self.device) + with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 1-D"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((10000), device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([10000], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) with self.assertRaisesRegex(RuntimeError, r"input length mismatch"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((log_probs.shape[0])) - target_lengths = torch.tensor((10000)) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([10000], device=self.device) with self.assertRaisesRegex(RuntimeError, r"target length mismatch"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([7, 8, 9, 10], dtype=targets_dtype, device=self.device) - log_probs = torch.rand(10, 5, dtype=self.dtype, device=self.device) + targets = torch.tensor([[7, 8, 9, 10]], dtype=targets_dtype, device=self.device) + log_probs = torch.rand(1, 10, 5, dtype=self.dtype, device=self.device) with self.assertRaisesRegex(ValueError, r"targets values must be less than the CTC dimension"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([1, 3, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[1, 3, 3]], dtype=targets_dtype, device=self.device) blank = 10000 with self.assertRaisesRegex(RuntimeError, r"blank must be within \[0, num classes\)"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) @@ -1238,14 +1245,14 @@ class FunctionalCUDAOnly(TestBaseMixin): @nested_params( [torch.half, torch.float, torch.double], [torch.int32, torch.int64], - [(50, 100), (100, 100)], - [(10,), (40,), (45,)], + [(1, 50, 100), (1, 100, 100)], + [(1, 10), (1, 40), (1, 45)], ) def test_forced_align_same_result(self, log_probs_dtype, targets_dtype, log_probs_shape, targets_shape): log_probs = torch.rand(log_probs_shape, dtype=log_probs_dtype, device=self.device) targets = torch.randint(1, 100, targets_shape, dtype=targets_dtype, device=self.device) - input_lengths = torch.tensor((log_probs.shape[0]), device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) log_probs_cuda = log_probs.cuda() targets_cuda = targets.cuda() input_lengths_cuda = input_lengths.cuda() diff --git a/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py b/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py index 2c2a0de8e0..ef7b52037c 100644 --- a/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py +++ b/test/torchaudio_unittest/functional/kaldi_compatibility_cpu_test.py @@ -1,12 +1,7 @@ import torch from torchaudio_unittest.common_utils import PytorchTestCase -from .kaldi_compatibility_test_impl import Kaldi, KaldiCPUOnly - - -class TestKaldiCPUOnly(KaldiCPUOnly, PytorchTestCase): - dtype = torch.float32 - device = torch.device("cpu") +from .kaldi_compatibility_test_impl import Kaldi class TestKaldiFloat32(Kaldi, PytorchTestCase): diff --git 
a/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py b/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py index d6b8b86180..d87b463b9a 100644 --- a/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py +++ b/test/torchaudio_unittest/functional/kaldi_compatibility_test_impl.py @@ -1,14 +1,6 @@ import torch import torchaudio.functional as F -from parameterized import parameterized -from torchaudio_unittest.common_utils import ( - get_sinusoid, - load_params, - save_wav, - skipIfNoExec, - TempDirMixin, - TestBaseMixin, -) +from torchaudio_unittest.common_utils import skipIfNoExec, TempDirMixin, TestBaseMixin from torchaudio_unittest.common_utils.kaldi_utils import convert_args, run_kaldi @@ -32,25 +24,3 @@ def test_sliding_window_cmn(self): command = ["apply-cmvn-sliding"] + convert_args(**kwargs) + ["ark:-", "ark:-"] kaldi_result = run_kaldi(command, "ark", tensor) self.assert_equal(result, expected=kaldi_result) - - -class KaldiCPUOnly(TempDirMixin, TestBaseMixin): - def assert_equal(self, output, *, expected, rtol=None, atol=None): - expected = expected.to(dtype=self.dtype, device=self.device) - self.assertEqual(output, expected, rtol=rtol, atol=atol) - - @parameterized.expand(load_params("kaldi_test_pitch_args.jsonl")) - @skipIfNoExec("compute-kaldi-pitch-feats") - def test_pitch_feats(self, kwargs): - """compute_kaldi_pitch produces numerically compatible result with compute-kaldi-pitch-feats""" - sample_rate = kwargs["sample_rate"] - waveform = get_sinusoid(dtype="float32", sample_rate=sample_rate) - result = F.compute_kaldi_pitch(waveform[0], **kwargs) - - waveform = get_sinusoid(dtype="int16", sample_rate=sample_rate) - wave_file = self.get_temp_path("test.wav") - save_wav(wave_file, waveform, sample_rate) - - command = ["compute-kaldi-pitch-feats"] + convert_args(**kwargs) + ["scp:-", "ark:-"] - kaldi_result = run_kaldi(command, "scp", wave_file) - self.assert_equal(result, expected=kaldi_result) diff --git a/test/torchaudio_unittest/functional/torchscript_consistency_impl.py b/test/torchaudio_unittest/functional/torchscript_consistency_impl.py index 9d36b52f93..c8afc3f8d1 100644 --- a/test/torchaudio_unittest/functional/torchscript_consistency_impl.py +++ b/test/torchaudio_unittest/functional/torchscript_consistency_impl.py @@ -585,18 +585,6 @@ def func(tensor): tensor = common_utils.get_whitenoise(sample_rate=44100) self._assert_consistency(func, (tensor,)) - @common_utils.skipIfNoKaldi - def test_compute_kaldi_pitch(self): - if self.dtype != torch.float32 or self.device != torch.device("cpu"): - raise unittest.SkipTest("Only float32, cpu is supported.") - - def func(tensor): - sample_rate: float = 44100.0 - return F.compute_kaldi_pitch(tensor, sample_rate) - - tensor = common_utils.get_whitenoise(sample_rate=44100) - self._assert_consistency(func, (tensor,)) - def test_resample_sinc(self): def func(tensor): sr1, sr2 = 16000, 8000 diff --git a/test/torchaudio_unittest/io/effector_test.py b/test/torchaudio_unittest/io/effector_test.py index 88d5683a63..833420c43f 100644 --- a/test/torchaudio_unittest/io/effector_test.py +++ b/test/torchaudio_unittest/io/effector_test.py @@ -30,14 +30,17 @@ def test_null(self): ("ogg", "flac"), # flac only supports s16 and s32 ("ogg", "opus"), # opus only supports 48k Hz ("ogg", "vorbis"), # vorbis only supports stereo + # ("ogg", "vorbis", 44100), + # this fails with a small discrepancy; 441024 vs 441000 + # TODO: investigate ("wav", None), ("wav", "pcm_u8"), ("mp3", None), + ("mulaw", None, 44100), # 
mulaw is encoded without header ] ) - def test_formats(self, format, encoder): + def test_formats(self, format, encoder, sample_rate=8000): """Formats (some with restrictions) just work without an issue in effector""" - sample_rate = 8000 effector = AudioEffector(format=format, encoder=encoder) original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False) @@ -80,3 +83,20 @@ def test_effect(self, effect): output = effector.apply(original, sample_rate) self.assertEqual(original.shape, output.shape) + + def test_resample(self): + """Resample option allows changing the sampling rate""" + sample_rate = 8000 + output_sample_rate = 16000 + num_channels = 3 + + effector = AudioEffector(effect="lowpass") + original = get_sinusoid(n_channels=num_channels, sample_rate=sample_rate, channels_first=False) + + output = effector.apply(original, sample_rate, output_sample_rate) + self.assertEqual(output.shape, [output_sample_rate, num_channels]) + + for chunk in effector.stream( + original, sample_rate, output_sample_rate=output_sample_rate, frames_per_chunk=output_sample_rate + ): + self.assertEqual(chunk.shape, [output_sample_rate, num_channels]) diff --git a/test/torchaudio_unittest/io/stream_reader_test.py b/test/torchaudio_unittest/io/stream_reader_test.py index 57dd7939ac..f17c3a0531 100644 --- a/test/torchaudio_unittest/io/stream_reader_test.py +++ b/test/torchaudio_unittest/io/stream_reader_test.py @@ -4,6 +4,7 @@ import torchaudio from parameterized import parameterized, parameterized_class from torchaudio_unittest.common_utils import ( + disabledInCI, get_asset_path, get_image, get_sinusoid, @@ -1109,68 +1110,105 @@ def test_dup_hw_acel(self): @_media_source class CudaDecoderTest(_MediaSourceMixin, TempDirMixin, TorchaudioTestCase): - @skipIfNoHWAccel("h264_cuvid") - def test_h264_cuvid(self): - """GPU decoder works for H264""" - src = self.get_src(get_asset_path("nasa_13013.mp4")) + def _test_decode( + self, + decoder: str, + src_path: str, + height: int, + width: int, + ref_num_frames: int, + hw_accel=None, + decoder_option=None, + dtype: torch.dtype = torch.uint8, + ): + src = self.get_src(get_asset_path(src_path)) r = StreamReader(src) - r.add_video_stream(10, decoder="h264_cuvid") + r.add_video_stream(10, decoder=decoder, decoder_option=decoder_option, hw_accel=hw_accel) num_frames = 0 for (chunk,) in r.stream(): - self.assertEqual(chunk.device, torch.device("cpu")) - self.assertEqual(chunk.dtype, torch.uint8) - self.assertEqual(chunk.shape, torch.Size([10, 3, 270, 480])) + self.assertEqual(chunk.device, torch.device(hw_accel or "cpu")) + self.assertEqual(chunk.dtype, dtype) + self.assertEqual(chunk.shape, torch.Size([10, 3, height, width])) + num_frames += chunk.size(0) - assert num_frames == 390 + assert num_frames == ref_num_frames + + @skipIfNoHWAccel("h264_cuvid") + def test_h264_cuvid(self): + """GPU decoder works for H264""" + self._test_decode("h264_cuvid", "nasa_13013.mp4", 270, 480, 390) @skipIfNoHWAccel("h264_cuvid") def test_h264_cuvid_hw_accel(self): """GPU decoder works for H264 with HW acceleration, and put the frames on CUDA tensor""" - src = self.get_src(get_asset_path("nasa_13013.mp4")) - r = StreamReader(src) - r.add_video_stream(10, decoder="h264_cuvid", hw_accel="cuda") + self._test_decode("h264_cuvid", "nasa_13013.mp4", 270, 480, 390, hw_accel="cuda:0") - num_frames = 0 - for (chunk,) in r.stream(): - self.assertEqual(chunk.device, torch.device("cuda:0")) - self.assertEqual(chunk.dtype, torch.uint8) - self.assertEqual(chunk.shape, torch.Size([10, 
3, 270, 480])) - num_frames += chunk.size(0) - assert num_frames == 390 + @skipIfNoHWAccel("h264_cuvid") + def test_h264_cuvid_hw_accel_resize(self): + """GPU decoder works for H264 with HW acceleration and resize option""" + w, h = 240, 136 + self._test_decode( + "h264_cuvid", "nasa_13013.mp4", h, w, 390, hw_accel="cuda:0", decoder_option={"resize": f"{w}x{h}"} + ) + + @skipIfNoHWAccel("h264_cuvid") + def test_h264_cuvid_hw_accel_crop(self): + """GPU decoder works for H264 with HW acceleration and crop option""" + top, bottom, left, right = 3, 5, 7, 9 + self._test_decode( + "h264_cuvid", + "nasa_13013.mp4", + 262, + 464, + 390, + hw_accel="cuda:0", + decoder_option={"crop": f"{top}x{bottom}x{left}x{right}"}, + ) @skipIfNoHWAccel("hevc_cuvid") def test_hevc_cuvid(self): """GPU decoder works for H265/HEVC""" - src = self.get_src(get_asset_path("testsrc.hevc")) - r = StreamReader(src) - r.add_video_stream(10, decoder="hevc_cuvid") - - num_frames = 0 - for (chunk,) in r.stream(): - self.assertEqual(chunk.device, torch.device("cpu")) - self.assertEqual(chunk.dtype, torch.uint8) - self.assertEqual(chunk.shape, torch.Size([10, 3, 144, 256])) - num_frames += chunk.size(0) - assert num_frames == 300 + self._test_decode("hevc_cuvid", "testsrc.hevc", 144, 256, 300) @skipIfNoHWAccel("hevc_cuvid") def test_hevc_cuvid_hw_accel(self): """GPU decoder works for H265/HEVC with HW acceleration, and put the frames on CUDA tensor""" - src = self.get_src(get_asset_path("testsrc.hevc")) - r = StreamReader(src) - r.add_video_stream(10, decoder="hevc_cuvid", hw_accel="cuda") + self._test_decode("hevc_cuvid", "testsrc.hevc", 144, 256, 300, hw_accel="cuda:0", dtype=torch.int16) - num_frames = 0 - for (chunk,) in r.stream(): - self.assertEqual(chunk.device, torch.device("cuda:0")) - self.assertEqual(chunk.dtype, torch.int16) - self.assertEqual(chunk.shape, torch.Size([10, 3, 144, 256])) - num_frames += chunk.size(0) - assert num_frames == 300 + @skipIfNoHWAccel("hevc_cuvid") + def test_hevc_cuvid_hw_accel_resize(self): + """GPU decoder works for H265/HEVC with HW acceleration and resize option""" + w, h = 128, 64 + self._test_decode( + "hevc_cuvid", + "testsrc.hevc", + h, + w, + 300, + hw_accel="cuda:0", + dtype=torch.int16, + decoder_option={"resize": f"{w}x{h}"}, + ) + + @skipIfNoHWAccel("hevc_cuvid") + def test_hevc_cuvid_hw_accel_crop(self): + """GPU decoder works for H265/HEVC with HW acceleration and crop option""" + top, bottom, left, right = 3, 5, 7, 9 + self._test_decode( + "hevc_cuvid", + "testsrc.hevc", + 136, + 240, + 300, + hw_accel="cuda:0", + dtype=torch.int16, + decoder_option={"crop": f"{top}x{bottom}x{left}x{right}"}, + ) @skipIfNoHWAccel("h264_cuvid") +# Disabled in CI: https://github.com/pytorch/audio/issues/3376 +@disabledInCI class FilterGraphWithCudaAccel(TorchaudioTestCase): def test_sclae_cuda_change_size(self): """scale_cuda filter can be used when HW accel is on""" diff --git a/test/torchaudio_unittest/io/stream_writer_test.py b/test/torchaudio_unittest/io/stream_writer_test.py index 91bd6e1fc7..557985dff8 100644 --- a/test/torchaudio_unittest/io/stream_writer_test.py +++ b/test/torchaudio_unittest/io/stream_writer_test.py @@ -389,6 +389,22 @@ def test_audio_num_frames_lossy(self, ext, num_channels, sample_rate): return self.assertEqual(saved.shape, data.shape) + def test_g722_sample_rate(self): + """Encoding G.722 properly converts sample rate to 16k""" + filename = "test.g722" + sample_rate = 41000 + data = get_sinusoid(sample_rate=sample_rate, n_channels=1, channels_first=False) + + # 
write data + dst = self.get_temp_path(filename) + w = StreamWriter(dst, format="g722") + w.add_audio_stream(sample_rate=sample_rate, num_channels=1) + with w.open(): + w.write_audio_chunk(0, data) + + r = StreamReader(src=self.get_temp_path(filename)) + self.assertEqual(r.get_src_stream_info(0).sample_rate, 16000) + def test_preserve_fps(self): """Decimal point frame rate is properly saved diff --git a/test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_test_impl.py b/test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_test_impl.py index 8560564596..5bfabdfbc2 100644 --- a/test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_test_impl.py +++ b/test/torchaudio_unittest/models/rnnt_decoder/rnnt_decoder_test_impl.py @@ -99,7 +99,7 @@ def test_torchscript_consistency_infer(self): self.assertEqual(res, scripted_res) state = res[1] - hypo = res[0][0] + hypo = res[0] scripted_state = scripted_res[1] - scripted_hypo = scripted_res[0][0] + scripted_hypo = scripted_res[0] diff --git a/test/torchaudio_unittest/models/wav2vec2/huggingface_intergration_test.py b/test/torchaudio_unittest/models/wav2vec2/huggingface_intergration_test.py index e8b0033802..7a3f0ee772 100644 --- a/test/torchaudio_unittest/models/wav2vec2/huggingface_intergration_test.py +++ b/test/torchaudio_unittest/models/wav2vec2/huggingface_intergration_test.py @@ -1,4 +1,5 @@ import json +import unittest import torch from parameterized import parameterized @@ -87,6 +88,7 @@ def _name_func(testcase_func, i, param): ) +@unittest.skip("transformers v4.30 seems to break the weight format. See https://github.com/pytorch/audio/issues/3430") @skipIfNoModule("transformers") class TestHFIntegration(TorchaudioTestCase): """Test the process of importing the models from Hugging Face Transformers @@ -144,14 +146,6 @@ def _test_import_pretrain(self, original, imported, config): hyp = imported.encoder.transformer(x) self.assertEqual(ref, hyp) - # Test get_intermediate_outputs method - b, l, e = 16, 3, config["hidden_size"] - x = torch.randn(b, l, e) - ref = original.encoder(x, output_hidden_states=True).hidden_states - hyp = imported.encoder.transformer.get_intermediate_outputs(x) - for i in range(len(hyp)): - self.assertEqual(ref[i + 1], hyp[i], atol=1e-4, rtol=0.001) - def _test_import_finetune(self, original, imported, config): # Aux x = torch.randn(3, 10, config["hidden_size"]) @@ -251,14 +245,6 @@ def test_import_pretrain_wavlm(self, config, _): hyp = imported.encoder.transformer(x) self.assertEqual(ref, hyp) - # Test get_intermediate_outputs method - b, l, e = 16, 3, config["hidden_size"] - x = torch.randn(b, l, e) - ref = original.encoder(x, output_hidden_states=True).hidden_states - hyp = imported.encoder.transformer.get_intermediate_outputs(x) - for i in range(len(hyp)): - self.assertEqual(ref[i + 1], hyp[i], atol=1e-4, rtol=0.001) - def _test_recreate(self, imported, reloaded, config): # FeatureExtractor x = torch.randn(3, 1024) diff --git a/test/torchaudio_unittest/prototype/conformer_wav2vec2_test.py b/test/torchaudio_unittest/prototype/conformer_wav2vec2_test.py index 4e207a296c..c49e75bf09 100644 --- a/test/torchaudio_unittest/prototype/conformer_wav2vec2_test.py +++ b/test/torchaudio_unittest/prototype/conformer_wav2vec2_test.py @@ -5,7 +5,7 @@ conformer_wav2vec2_pretrain_base, conformer_wav2vec2_pretrain_large, ) -from torchaudio_unittest.common_utils import nested_params, skipIfNoCuda, torch_script, TorchaudioTestCase +from torchaudio_unittest.common_utils import disabledInCI, nested_params, skipIfNoCuda, 
torch_script, TorchaudioTestCase class TestConformerWav2Vec2(TorchaudioTestCase): @@ -33,6 +33,8 @@ def test_cpu_smoke_test(self, dtype): @parameterized.expand([(torch.float32,), (torch.float64,)]) @skipIfNoCuda + # Disabled in CI: https://github.com/pytorch/audio/issues/3376 + @disabledInCI def test_cuda_smoke_test(self, dtype): model = conformer_wav2vec2_base() self._smoke_test(model, torch.device("cuda"), dtype) @@ -50,6 +52,8 @@ def test_pretrain_cpu_smoke_test(self, model, dtype): [torch.float32, torch.float64], ) @skipIfNoCuda + # Disabled in CI: https://github.com/pytorch/audio/issues/3376 + @disabledInCI def test_pretrain_cuda_smoke_test(self, model, dtype): model = model() self._smoke_test(model, torch.device("cuda"), dtype) diff --git a/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cpu_test.py b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cpu_test.py new file mode 100644 index 0000000000..76123bcc59 --- /dev/null +++ b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cpu_test.py @@ -0,0 +1,7 @@ +from torchaudio_unittest.common_utils import PytorchTestCase + +from .librosa_compatibility_test_impl import Functional + + +class TestFunctionalCPU(Functional, PytorchTestCase): + device = "cpu" diff --git a/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cuda_test.py b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cuda_test.py new file mode 100644 index 0000000000..373f80238e --- /dev/null +++ b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_cuda_test.py @@ -0,0 +1,8 @@ +from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda + +from .librosa_compatibility_test_impl import Functional + + +@skipIfNoCuda +class TestFunctionalCUDA(Functional, PytorchTestCase): + device = "cuda" diff --git a/test/torchaudio_unittest/prototype/functional/librosa_compatibility_test_impl.py b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_test_impl.py new file mode 100644 index 0000000000..c850e104e8 --- /dev/null +++ b/test/torchaudio_unittest/prototype/functional/librosa_compatibility_test_impl.py @@ -0,0 +1,62 @@ +import unittest + +import torch +import torchaudio.prototype.functional as F +from torchaudio._internal.module_utils import is_module_available + +LIBROSA_AVAILABLE = is_module_available("librosa") + +if LIBROSA_AVAILABLE: + import librosa + import numpy as np + + +from torchaudio_unittest.common_utils import TestBaseMixin + + +@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available") +class Functional(TestBaseMixin): + """Test suite for functions in `functional` module.""" + + dtype = torch.float64 + + def test_chroma_filterbank(self): + sample_rate = 16_000 + n_stft = 400 + n_chroma = 12 + tuning = 0.0 + ctroct = 5.0 + octwidth = 2.0 + norm = 2 + base_c = True + + # NOTE: difference in convention with librosa. + # Whereas librosa expects users to supply the full count of FFT frequency bins, + # TorchAudio expects users to supply the count with redundant bins, i.e. those in the upper half of the + # frequency range, removed. This is consistent with other TorchAudio filter bank functions. 
+ n_freqs = n_stft // 2 + 1 + + torchaudio_fbank = F.chroma_filterbank( + sample_rate=sample_rate, + n_freqs=n_freqs, + n_chroma=n_chroma, + tuning=tuning, + ctroct=ctroct, + octwidth=octwidth, + norm=norm, + base_c=base_c, + ) + + librosa_fbank = librosa.filters.chroma( + sr=sample_rate, + n_fft=n_stft, + n_chroma=n_chroma, + tuning=tuning, + ctroct=ctroct, + octwidth=octwidth, + norm=norm, + base_c=True, + dtype=np.float32, + ) + + self.assertEqual(torchaudio_fbank, librosa_fbank.T) diff --git a/test/torchaudio_unittest/prototype/ssl_model_test.py b/test/torchaudio_unittest/prototype/ssl_model_test.py index 608f9a28ee..2f162d7434 100644 --- a/test/torchaudio_unittest/prototype/ssl_model_test.py +++ b/test/torchaudio_unittest/prototype/ssl_model_test.py @@ -32,7 +32,12 @@ def test_cpu_smoke_test(self, model_feature_dim, dtype): self._smoke_test(model, feature_dim, torch.device("cpu"), dtype) @nested_params( - [(conformer_wav2vec2_base, 64), (conformer_wav2vec2_pretrain_base, 64), (emformer_hubert_base, 80)], + [ + (conformer_wav2vec2_base, 64), + # Skip since failing see issue: https://github.com/pytorch/audio/issues/3376 + # (conformer_wav2vec2_pretrain_base, 64), + (emformer_hubert_base, 80), + ], [torch.float32, torch.float64], ) @skipIfNoCuda diff --git a/test/torchaudio_unittest/prototype/transforms/autograd_test_impl.py b/test/torchaudio_unittest/prototype/transforms/autograd_test_impl.py index 679b88938a..d42956cac0 100644 --- a/test/torchaudio_unittest/prototype/transforms/autograd_test_impl.py +++ b/test/torchaudio_unittest/prototype/transforms/autograd_test_impl.py @@ -44,3 +44,19 @@ def test_barkscale(self): get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1 ) self.assert_grad(transform, [spec]) + + def test_chroma_spectrogram(self): + sample_rate = 8000 + transform = T.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400) + waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2) + self.assert_grad(transform, [waveform], nondet_tol=1e-10) + + def test_chroma_scale(self): + sample_rate = 8000 + n_fft = 400 + n_chroma = 12 + transform = T.ChromaScale(sample_rate=sample_rate, n_freqs=n_fft // 2 + 1, n_chroma=n_chroma) + waveform = get_spectrogram( + get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2), n_fft=n_fft, power=1 + ) + self.assert_grad(transform, [waveform], nondet_tol=1e-10) diff --git a/test/torchaudio_unittest/prototype/transforms/batch_consistency_test.py b/test/torchaudio_unittest/prototype/transforms/batch_consistency_test.py index 9674f9a77d..3c052c1ec7 100644 --- a/test/torchaudio_unittest/prototype/transforms/batch_consistency_test.py +++ b/test/torchaudio_unittest/prototype/transforms/batch_consistency_test.py @@ -2,7 +2,6 @@ import torch import torchaudio.prototype.transforms as T -import torchaudio.transforms as transforms from torchaudio_unittest.common_utils import TorchaudioTestCase @@ -40,3 +39,20 @@ def test_batch_InverseBarkScale(self): # Because InverseBarkScale runs SGD on randomly initialized values so they do not yield # exactly same result. For this reason, tolerance is very relaxed here. 
self.assert_batch_consistency(transform, bark_spec, atol=1.0, rtol=1e-5) + + def test_batch_chroma_scale(self): + n_freqs = 201 + specgram = torch.randn(3, 2, n_freqs, 256) + + atol = 1e-6 if os.name == "nt" else 1e-8 + transform = T.ChromaScale(16000, n_freqs, n_chroma=12) + + self.assert_batch_consistency(transform, specgram, atol=atol) + + def test_batch_chroma_spectrogram(self): + waveform = torch.randn(3, 2, 4000) + + atol = 1e-6 if os.name == "nt" else 1e-8 + transform = T.ChromaSpectrogram(16000, 512, n_chroma=12) + + self.assert_batch_consistency(transform, waveform, atol=atol) diff --git a/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cpu_test.py b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cpu_test.py new file mode 100644 index 0000000000..c39bc766a6 --- /dev/null +++ b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cpu_test.py @@ -0,0 +1,9 @@ +import torch +from torchaudio_unittest.common_utils import PytorchTestCase + +from .librosa_compatibility_test_impl import TransformsTestBase + + +class TestTransforms(TransformsTestBase, PytorchTestCase): + dtype = torch.float64 + device = torch.device("cpu") diff --git a/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cuda_test.py b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cuda_test.py new file mode 100644 index 0000000000..a82c72ab29 --- /dev/null +++ b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_cuda_test.py @@ -0,0 +1,10 @@ +import torch +from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda + +from .librosa_compatibility_test_impl import TransformsTestBase + + +@skipIfNoCuda +class TestTransforms(TransformsTestBase, PytorchTestCase): + dtype = torch.float64 + device = torch.device("cuda") diff --git a/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_test_impl.py b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_test_impl.py new file mode 100644 index 0000000000..bf55b74fe1 --- /dev/null +++ b/test/torchaudio_unittest/prototype/transforms/librosa_compatibility_test_impl.py @@ -0,0 +1,50 @@ +import unittest + +import torch +import torchaudio.prototype.transforms as T +from parameterized import param +from torchaudio._internal.module_utils import is_module_available +from torchaudio_unittest.common_utils import get_sinusoid, nested_params, TestBaseMixin + +LIBROSA_AVAILABLE = is_module_available("librosa") + +if LIBROSA_AVAILABLE: + import librosa + + +@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available") +class TransformsTestBase(TestBaseMixin): + @nested_params( + [ + param(n_fft=400, hop_length=200, n_chroma=13), + param(n_fft=600, hop_length=100, n_chroma=24), + param(n_fft=200, hop_length=50, n_chroma=12), + ], + ) + def test_chroma_spectrogram(self, n_fft, hop_length, n_chroma): + sample_rate = 16000 + waveform = get_sinusoid( + sample_rate=sample_rate, + n_channels=1, + ).to(self.device, self.dtype) + + expected = librosa.feature.chroma_stft( + y=waveform[0].cpu().numpy(), + sr=sample_rate, + n_fft=n_fft, + hop_length=hop_length, + n_chroma=n_chroma, + norm=None, + pad_mode="reflect", + tuning=0.0, + ) + result = T.ChromaSpectrogram( + sample_rate=sample_rate, + window_fn=torch.hann_window, + hop_length=hop_length, + n_chroma=n_chroma, + n_fft=n_fft, + tuning=0.0, + ).to(self.device, self.dtype)(waveform)[0] + + self.assertEqual(result, expected, atol=5e-4, rtol=1e-4) diff --git 
a/test/torchaudio_unittest/sox_effect/smoke_test.py b/test/torchaudio_unittest/sox_effect/smoke_test.py index a5de940a50..30befd54ab 100644 --- a/test/torchaudio_unittest/sox_effect/smoke_test.py +++ b/test/torchaudio_unittest/sox_effect/smoke_test.py @@ -54,24 +54,3 @@ def test_apply_effects_file(self, args): _found, _sr = sox_effects.apply_effects_file( input_path, effects, normalize=False, channels_first=channels_first ) - - @parameterized.expand( - load_params("sox_effect_test_args.jsonl"), - name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}', - ) - def test_apply_effects_fileobj(self, args): - """`apply_effects_file` should return identical data as sox command""" - dtype = "int32" - channels_first = True - effects = args["effects"] - num_channels = args.get("num_channels", 2) - input_sr = args.get("input_sample_rate", 8000) - - input_path = self.get_temp_path("input.wav") - data = get_wav_data(dtype, num_channels, channels_first=channels_first) - save_wav(input_path, data, input_sr, channels_first=channels_first) - - with open(input_path, "rb") as fileobj: - _found, _sr = sox_effects.apply_effects_file( - fileobj, effects, normalize=False, channels_first=channels_first - ) diff --git a/test/torchaudio_unittest/sox_effect/sox_effect_test.py b/test/torchaudio_unittest/sox_effect/sox_effect_test.py index be6b646617..2099505502 100644 --- a/test/torchaudio_unittest/sox_effect/sox_effect_test.py +++ b/test/torchaudio_unittest/sox_effect/sox_effect_test.py @@ -1,20 +1,14 @@ -import io import itertools -import tarfile from pathlib import Path from parameterized import parameterized from torchaudio import sox_effects -from torchaudio._internal import module_utils as _mod_utils from torchaudio_unittest.common_utils import ( get_sinusoid, get_wav_data, - HttpServerMixin, load_wav, PytorchTestCase, save_wav, - skipIfNoExec, - skipIfNoModule, skipIfNoSox, sox_utils, TempDirMixin, @@ -23,10 +17,6 @@ from .common import load_params, name_func -if _mod_utils.is_module_available("requests"): - import requests - - @skipIfNoSox class TestSoxEffects(PytorchTestCase): def test_init(self): @@ -241,136 +231,3 @@ def test_vorbis(self, sample_rate, num_channels): assert sr == expected_sr self.assertEqual(found, expected) - - -@skipIfNoExec("sox") -@skipIfNoSox -class TestFileObject(TempDirMixin, PytorchTestCase): - @parameterized.expand( - [ - ("wav", None), - ("flac", 0), - ("flac", 5), - ("flac", 8), - ("vorbis", -1), - ("vorbis", 10), - ("amb", None), - ] - ) - def test_fileobj(self, ext, compression): - """Applying effects via file object works""" - sample_rate = 16000 - channels_first = True - effects = [["band", "300", "10"]] - input_path = self.get_temp_path(f"input.{ext}") - reference_path = self.get_temp_path("reference.wav") - - sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) - sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) - expected, expected_sr = load_wav(reference_path) - - with open(input_path, "rb") as fileobj: - found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) - save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) - assert sr == expected_sr - self.assertEqual(found, expected) - - @parameterized.expand( - [ - ("wav", None), - ("flac", 0), - ("flac", 5), - ("flac", 8), - ("vorbis", -1), - ("vorbis", 10), - ("amb", None), - ] - ) - def test_bytesio(self, ext, compression): - """Applying effects via BytesIO object 
works""" - sample_rate = 16000 - channels_first = True - effects = [["band", "300", "10"]] - input_path = self.get_temp_path(f"input.{ext}") - reference_path = self.get_temp_path("reference.wav") - - sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) - sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) - expected, expected_sr = load_wav(reference_path) - - with open(input_path, "rb") as file_: - fileobj = io.BytesIO(file_.read()) - found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) - save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) - assert sr == expected_sr - self.assertEqual(found, expected) - - @parameterized.expand( - [ - ("wav", None), - ("flac", 0), - ("flac", 5), - ("flac", 8), - ("vorbis", -1), - ("vorbis", 10), - ("amb", None), - ] - ) - def test_tarfile(self, ext, compression): - """Applying effects to compressed audio via file-like file works""" - sample_rate = 16000 - channels_first = True - effects = [["band", "300", "10"]] - audio_file = f"input.{ext}" - - input_path = self.get_temp_path(audio_file) - reference_path = self.get_temp_path("reference.wav") - archive_path = self.get_temp_path("archive.tar.gz") - - sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) - sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) - expected, expected_sr = load_wav(reference_path) - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(input_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first) - save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) - assert sr == expected_sr - self.assertEqual(found, expected) - - -@skipIfNoSox -@skipIfNoExec("sox") -@skipIfNoModule("requests") -class TestFileObjectHttp(HttpServerMixin, PytorchTestCase): - @parameterized.expand( - [ - ("wav", None), - ("flac", 0), - ("flac", 5), - ("flac", 8), - ("vorbis", -1), - ("vorbis", 10), - ("amb", None), - ] - ) - def test_requests(self, ext, compression): - sample_rate = 16000 - channels_first = True - effects = [["band", "300", "10"]] - audio_file = f"input.{ext}" - input_path = self.get_temp_path(audio_file) - reference_path = self.get_temp_path("reference.wav") - - sox_utils.gen_audio_file(input_path, sample_rate, num_channels=2, compression=compression) - sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32) - expected, expected_sr = load_wav(reference_path) - - url = self.get_url(audio_file) - with requests.get(url, stream=True) as resp: - found, sr = sox_effects.apply_effects_file(resp.raw, effects, channels_first=channels_first) - save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first) - assert sr == expected_sr - self.assertEqual(found, expected) diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt deleted file mode 100644 index f2dfa0a786..0000000000 --- a/third_party/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -set(CMAKE_CXX_VISIBILITY_PRESET hidden) - -file(MAKE_DIRECTORY install/include) -file(MAKE_DIRECTORY install/lib) - -################################################################################ -# sox -################################################################################ -if (BUILD_SOX) - 
add_subdirectory(sox) -endif() - -################################################################################ -# kaldi -################################################################################ -if (BUILD_KALDI) - add_subdirectory(kaldi) -endif() diff --git a/third_party/ffmpeg/CMakeLists.txt b/third_party/ffmpeg/CMakeLists.txt new file mode 100644 index 0000000000..f2feb09c61 --- /dev/null +++ b/third_party/ffmpeg/CMakeLists.txt @@ -0,0 +1,91 @@ +################################################################################ +# This file defines the following FFmpeg libraries using pre-built binaries. + +add_library(ffmpeg4 INTERFACE) +add_library(ffmpeg ALIAS ffmpeg4) + +################################################################################ + +include(FetchContent) + +set(base_url https://pytorch.s3.amazonaws.com/torchaudio/ffmpeg) + +if (APPLE) + if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") + FetchContent_Declare( + f4 + URL ${base_url}/2023-07-06/macos_arm64/4.1.8.tar.gz + URL_HASH SHA256=a44b8152b7f204ce5050fc7f6fd2bbbafe7ae4e45f03e135f3b45dd9a08f404e + ) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") + FetchContent_Declare( + f4 + URL ${base_url}/2023-07-06/macos_x86_64/4.1.8.tar.gz + URL_HASH SHA256=392d5af0b24535bfc69d6244e7595e5f07117b93d94505d0a4b34c82ae479f48 + ) + else () + message( + FATAL_ERROR + "CPU architecture ${CMAKE_SYSTEM_PROCESSOR} is not currently supported. If you do not need FFmpeg integration, then setting USE_FFMPEG=0 will bypass the issue.") + endif() +elseif (UNIX) + if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") + FetchContent_Declare( + f4 + URL ${base_url}/2023-07-06/linux_aarch64/4.1.8.tar.gz + URL_HASH SHA256=aae0b713040e30ceebe0d0bc82353d3d9054055c7af8a4f4abc1766015ab7681 + ) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") + FetchContent_Declare( + f4 + URL ${base_url}/2023-07-06/linux_x86_64/4.1.8.tar.gz + URL_HASH SHA256=52e53b8857739bdd54f9d8541e22569b57f6c6f16504ee83963c2ed3e7061a23 + ) + else () + # Possible case ppc64le (though it's not officially supported.) + message( + FATAL_ERROR + "CPU architecture ${CMAKE_SYSTEM_PROCESSOR} is not currently supported. 
If you do not need FFmpeg integration, then setting USE_FFMPEG=0 will bypass the issue.") + endif() +elseif(MSVC) + FetchContent_Declare( + f4 + URL ${base_url}/2023-07-06/windows/4.1.8.tar.gz + URL_HASH SHA256=c45cd36e0575490f97ace07365bb67c5e1cbe9f3e6a4272d035c19348df96790 + ) +endif() + +FetchContent_MakeAvailable(f4) +target_include_directories(ffmpeg4 INTERFACE ${f4_SOURCE_DIR}/include) + +if(APPLE) + target_link_libraries( + ffmpeg4 + INTERFACE + ${f4_SOURCE_DIR}/lib/libavutil.56.dylib + ${f4_SOURCE_DIR}/lib/libavcodec.58.dylib + ${f4_SOURCE_DIR}/lib/libavformat.58.dylib + ${f4_SOURCE_DIR}/lib/libavdevice.58.dylib + ${f4_SOURCE_DIR}/lib/libavfilter.7.dylib + ) +elseif (UNIX) + target_link_libraries( + ffmpeg4 + INTERFACE + ${f4_SOURCE_DIR}/lib/libavutil.so.56 + ${f4_SOURCE_DIR}/lib/libavcodec.so.58 + ${f4_SOURCE_DIR}/lib/libavformat.so.58 + ${f4_SOURCE_DIR}/lib/libavdevice.so.58 + ${f4_SOURCE_DIR}/lib/libavfilter.so.7 + ) +elseif(MSVC) + target_link_libraries( + ffmpeg4 + INTERFACE + ${f4_SOURCE_DIR}/bin/avutil.lib + ${f4_SOURCE_DIR}/bin/avcodec.lib + ${f4_SOURCE_DIR}/bin/avformat.lib + ${f4_SOURCE_DIR}/bin/avdevice.lib + ${f4_SOURCE_DIR}/bin/avfilter.lib + ) +endif() diff --git a/third_party/kaldi/CMakeLists.txt b/third_party/kaldi/CMakeLists.txt deleted file mode 100644 index 75fca8ed99..0000000000 --- a/third_party/kaldi/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -set(KALDI_REPO ${CMAKE_CURRENT_SOURCE_DIR}/submodule) - -if (NOT EXISTS ${KALDI_REPO}/src/base/version.h) -# Apply custom patch -execute_process( - WORKING_DIRECTORY ${KALDI_REPO} - COMMAND "git" "checkout" "." - ) -execute_process( - WORKING_DIRECTORY ${KALDI_REPO} - COMMAND git apply ../kaldi.patch - ) -# Update the version string -execute_process( - WORKING_DIRECTORY ${KALDI_REPO}/src/base - COMMAND bash get_version.sh - ) -endif() - -set(KALDI_SOURCES - src/matrix/kaldi-vector.cc - src/matrix/kaldi-matrix.cc - submodule/src/base/kaldi-error.cc - submodule/src/base/kaldi-math.cc - submodule/src/feat/feature-functions.cc - submodule/src/feat/pitch-functions.cc - submodule/src/feat/resample.cc - ) - -add_library(kaldi STATIC ${KALDI_SOURCES}) -target_include_directories(kaldi PUBLIC src submodule/src) -target_include_directories(kaldi PRIVATE ${TORCH_INCLUDE_DIRS}) diff --git a/third_party/kaldi/README.md b/third_party/kaldi/README.md deleted file mode 100644 index f981674601..0000000000 --- a/third_party/kaldi/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Custom Kaldi build - -This directory contains original Kaldi repository (as submodule), [the custom implementation of Kaldi's vector/matrix](./src) and the build script. - -We use the custom build process so that the resulting library only contains what torchaudio needs. -We use the custom vector/matrix implementation so that we can use the same BLAS library that PyTorch is compiled with, and so that we can (hopefully, in future) take advantage of other PyTorch features (such as differentiability and GPU support). The down side of this approach is that it adds a lot of overhead compared to the original Kaldi (operator dispatch and element-wise processing, which PyTorch is not efficient at). We can improve this gradually, and if you are interested in helping, please let us know by opening an issue. 
diff --git a/third_party/kaldi/kaldi.patch b/third_party/kaldi/kaldi.patch deleted file mode 100644 index 40667bced8..0000000000 --- a/third_party/kaldi/kaldi.patch +++ /dev/null @@ -1,76 +0,0 @@ -diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h -index 7ebf4f853..c15b288b2 100644 ---- a/src/base/kaldi-types.h -+++ b/src/base/kaldi-types.h -@@ -41,6 +41,7 @@ typedef float BaseFloat; - - // for discussion on what to do if you need compile kaldi - // without OpenFST, see the bottom of this this file -+/* - #include - - namespace kaldi { -@@ -53,10 +54,10 @@ namespace kaldi { - typedef float float32; - typedef double double64; - } // end namespace kaldi -+*/ - - // In a theoretical case you decide compile Kaldi without the OpenFST - // comment the previous namespace statement and uncomment the following --/* - namespace kaldi { - typedef int8_t int8; - typedef int16_t int16; -@@ -70,6 +71,5 @@ namespace kaldi { - typedef float float32; - typedef double double64; - } // end namespace kaldi --*/ - - #endif // KALDI_BASE_KALDI_TYPES_H_ -diff --git a/src/matrix/matrix-lib.h b/src/matrix/matrix-lib.h -index b6059b06c..4fb9e1b16 100644 ---- a/src/matrix/matrix-lib.h -+++ b/src/matrix/matrix-lib.h -@@ -25,14 +25,14 @@ - #include "base/kaldi-common.h" - #include "matrix/kaldi-vector.h" - #include "matrix/kaldi-matrix.h" --#include "matrix/sp-matrix.h" --#include "matrix/tp-matrix.h" -+// #include "matrix/sp-matrix.h" -+// #include "matrix/tp-matrix.h" - #include "matrix/matrix-functions.h" - #include "matrix/srfft.h" - #include "matrix/compressed-matrix.h" --#include "matrix/sparse-matrix.h" -+// #include "matrix/sparse-matrix.h" - #include "matrix/optimization.h" --#include "matrix/numpy-array.h" -+// #include "matrix/numpy-array.h" - - #endif - -diff --git a/src/util/common-utils.h b/src/util/common-utils.h -index cfb0c255c..48d199e97 100644 ---- a/src/util/common-utils.h -+++ b/src/util/common-utils.h -@@ -21,11 +21,11 @@ - - #include "base/kaldi-common.h" - #include "util/parse-options.h" --#include "util/kaldi-io.h" --#include "util/simple-io-funcs.h" --#include "util/kaldi-holder.h" --#include "util/kaldi-table.h" --#include "util/table-types.h" --#include "util/text-utils.h" -+// #include "util/kaldi-io.h" -+// #include "util/simple-io-funcs.h" -+// #include "util/kaldi-holder.h" -+// #include "util/kaldi-table.h" -+// #include "util/table-types.h" -+// #include "util/text-utils.h" - - #endif // KALDI_UTIL_COMMON_UTILS_H_ diff --git a/third_party/kaldi/src/matrix/kaldi-matrix.cc b/third_party/kaldi/src/matrix/kaldi-matrix.cc deleted file mode 100644 index a89c3809c9..0000000000 --- a/third_party/kaldi/src/matrix/kaldi-matrix.cc +++ /dev/null @@ -1,39 +0,0 @@ -#include "matrix/kaldi-matrix.h" -#include - -namespace { - -template -void assert_matrix_shape(const torch::Tensor& tensor_); - -template <> -void assert_matrix_shape(const torch::Tensor& tensor_) { - TORCH_INTERNAL_ASSERT(tensor_.ndimension() == 2); - TORCH_INTERNAL_ASSERT(tensor_.dtype() == torch::kFloat32); - TORCH_CHECK(tensor_.device().is_cpu(), "Input tensor has to be on CPU."); -} - -template <> -void assert_matrix_shape(const torch::Tensor& tensor_) { - TORCH_INTERNAL_ASSERT(tensor_.ndimension() == 2); - TORCH_INTERNAL_ASSERT(tensor_.dtype() == torch::kFloat64); - TORCH_CHECK(tensor_.device().is_cpu(), "Input tensor has to be on CPU."); -} - -} // namespace - -namespace kaldi { - -template -MatrixBase::MatrixBase(torch::Tensor tensor) : tensor_(tensor) { - assert_matrix_shape(tensor_); -}; - -template class Matrix; 
-template class Matrix; -template class MatrixBase; -template class MatrixBase; -template class SubMatrix; -template class SubMatrix; - -} // namespace kaldi diff --git a/third_party/kaldi/src/matrix/kaldi-matrix.h b/third_party/kaldi/src/matrix/kaldi-matrix.h deleted file mode 100644 index f64828b84f..0000000000 --- a/third_party/kaldi/src/matrix/kaldi-matrix.h +++ /dev/null @@ -1,178 +0,0 @@ -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h - -#ifndef KALDI_MATRIX_KALDI_MATRIX_H_ -#define KALDI_MATRIX_KALDI_MATRIX_H_ - -#include -#include "matrix/kaldi-vector.h" -#include "matrix/matrix-common.h" - -using namespace torch::indexing; - -namespace kaldi { - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L44-L48 -template -class MatrixBase { - public: - //////////////////////////////////////////////////////////////////////////////// - // PyTorch-specific items - //////////////////////////////////////////////////////////////////////////////// - torch::Tensor tensor_; - /// Construct VectorBase which is an interface to an existing torch::Tensor - /// object. - MatrixBase(torch::Tensor tensor); - - //////////////////////////////////////////////////////////////////////////////// - // Kaldi-compatible items - //////////////////////////////////////////////////////////////////////////////// - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L62-L63 - inline MatrixIndexT NumRows() const { - return tensor_.size(0); - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L65-L66 - inline MatrixIndexT NumCols() const { - return tensor_.size(1); - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L177-L178 - void CopyColFromVec(const VectorBase& v, const MatrixIndexT col) { - tensor_.index_put_({Slice(), col}, v.tensor_); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L99-L107 - inline Real& operator()(MatrixIndexT r, MatrixIndexT c) { - // CPU only - return tensor_.accessor()[r][c]; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L112-L120 - inline const Real operator()(MatrixIndexT r, MatrixIndexT c) const { - return tensor_.index({Slice(r), Slice(c)}).item().template to(); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L138-L141 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.cc#L859-L898 - template - void CopyFromMat( - const MatrixBase& M, - MatrixTransposeType trans = kNoTrans) { - auto src = M.tensor_; - if (trans == kTrans) - src = src.transpose(1, 0); - tensor_.index_put_({Slice(), Slice()}, src); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L186-L191 - inline const SubVector Row(MatrixIndexT i) const { - return SubVector(*this, i); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L208-L211 - inline SubMatrix RowRange( - const MatrixIndexT row_offset, - const MatrixIndexT num_rows) const { - return SubMatrix(*this, row_offset, num_rows, 0, NumCols()); - } - - protected: - // 
https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L749-L753 - explicit MatrixBase() : tensor_(torch::empty({0, 0})) { - KALDI_ASSERT_IS_FLOATING_TYPE(Real); - } -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L781-L784 -template -class Matrix : public MatrixBase { - public: - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L786-L787 - Matrix() : MatrixBase() {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L789-L793 - Matrix( - const MatrixIndexT r, - const MatrixIndexT c, - MatrixResizeType resize_type = kSetZero, - MatrixStrideType stride_type = kDefaultStride) - : MatrixBase() { - Resize(r, c, resize_type, stride_type); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L808-L811 - explicit Matrix( - const MatrixBase& M, - MatrixTransposeType trans = kNoTrans) - : MatrixBase( - trans == kNoTrans ? M.tensor_ : M.tensor_.transpose(1, 0)) {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L816-L819 - template - explicit Matrix( - const MatrixBase& M, - MatrixTransposeType trans = kNoTrans) - : MatrixBase( - trans == kNoTrans ? M.tensor_ : M.tensor_.transpose(1, 0)) {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L859-L874 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.cc#L817-L857 - void Resize( - const MatrixIndexT r, - const MatrixIndexT c, - MatrixResizeType resize_type = kSetZero, - MatrixStrideType stride_type = kDefaultStride) { - auto& tensor_ = MatrixBase::tensor_; - switch (resize_type) { - case kSetZero: - tensor_.resize_({r, c}).zero_(); - break; - case kUndefined: - tensor_.resize_({r, c}); - break; - case kCopyData: - auto tmp = tensor_; - auto tmp_rows = tmp.size(0); - auto tmp_cols = tmp.size(1); - tensor_.resize_({r, c}).zero_(); - auto rows = Slice(None, r < tmp_rows ? r : tmp_rows); - auto cols = Slice(None, c < tmp_cols ? 
c : tmp_cols); - tensor_.index_put_({rows, cols}, tmp.index({rows, cols})); - break; - } - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L876-L883 - Matrix& operator=(const MatrixBase& other) { - if (MatrixBase::NumRows() != other.NumRows() || - MatrixBase::NumCols() != other.NumCols()) - Resize(other.NumRows(), other.NumCols(), kUndefined); - MatrixBase::CopyFromMat(other); - return *this; - } -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L940-L948 -template -class SubMatrix : public MatrixBase { - public: - SubMatrix( - const MatrixBase& T, - const MatrixIndexT ro, // row offset, 0 < ro < NumRows() - const MatrixIndexT r, // number of rows, r > 0 - const MatrixIndexT co, // column offset, 0 < co < NumCols() - const MatrixIndexT c) // number of columns, c > 0 - : MatrixBase( - T.tensor_.index({Slice(ro, ro + r), Slice(co, co + c)})) {} -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-matrix.h#L1059-L1060 -template -std::ostream& operator<<(std::ostream& Out, const MatrixBase& M) { - Out << M.tensor_; - return Out; -} - -} // namespace kaldi - -#endif diff --git a/third_party/kaldi/src/matrix/kaldi-vector.cc b/third_party/kaldi/src/matrix/kaldi-vector.cc deleted file mode 100644 index df59f13a36..0000000000 --- a/third_party/kaldi/src/matrix/kaldi-vector.cc +++ /dev/null @@ -1,42 +0,0 @@ -#include "matrix/kaldi-vector.h" -#include -#include "matrix/kaldi-matrix.h" - -namespace { - -template -void assert_vector_shape(const torch::Tensor& tensor_); - -template <> -void assert_vector_shape(const torch::Tensor& tensor_) { - TORCH_INTERNAL_ASSERT(tensor_.ndimension() == 1); - TORCH_INTERNAL_ASSERT(tensor_.dtype() == torch::kFloat32); - TORCH_CHECK(tensor_.device().is_cpu(), "Input tensor has to be on CPU."); -} - -template <> -void assert_vector_shape(const torch::Tensor& tensor_) { - TORCH_INTERNAL_ASSERT(tensor_.ndimension() == 1); - TORCH_INTERNAL_ASSERT(tensor_.dtype() == torch::kFloat64); - TORCH_CHECK(tensor_.device().is_cpu(), "Input tensor has to be on CPU."); -} - -} // namespace - -namespace kaldi { - -template -VectorBase::VectorBase(torch::Tensor tensor) - : tensor_(tensor), data_(tensor.data_ptr()) { - assert_vector_shape(tensor_); -}; - -template -VectorBase::VectorBase() : VectorBase(torch::empty({0})) {} - -template class Vector; -template class Vector; -template class VectorBase; -template class VectorBase; - -} // namespace kaldi diff --git a/third_party/kaldi/src/matrix/kaldi-vector.h b/third_party/kaldi/src/matrix/kaldi-vector.h deleted file mode 100644 index 620f3676d3..0000000000 --- a/third_party/kaldi/src/matrix/kaldi-vector.h +++ /dev/null @@ -1,313 +0,0 @@ -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h - -#ifndef KALDI_MATRIX_KALDI_VECTOR_H_ -#define KALDI_MATRIX_KALDI_VECTOR_H_ - -#include -#include "matrix/matrix-common.h" - -using namespace torch::indexing; - -namespace kaldi { - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L36-L40 -template -class VectorBase { - public: - //////////////////////////////////////////////////////////////////////////////// - // PyTorch-specific things - //////////////////////////////////////////////////////////////////////////////// - torch::Tensor tensor_; - - /// Construct VectorBase which is an interface to an 
existing torch::Tensor - /// object. - VectorBase(torch::Tensor tensor); - - //////////////////////////////////////////////////////////////////////////////// - // Kaldi-compatible methods - //////////////////////////////////////////////////////////////////////////////// - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L42-L43 - void SetZero() { - Set(0); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L48-L49 - void Set(Real f) { - tensor_.index_put_({"..."}, f); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L62-L63 - inline MatrixIndexT Dim() const { - return tensor_.numel(); - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L68-L69 - inline Real* Data() { - return data_; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L71-L72 - inline const Real* Data() const { - return data_; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L74-L79 - inline Real operator()(MatrixIndexT i) const { - return data_[i]; - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L81-L86 - inline Real& operator()(MatrixIndexT i) { - return tensor_.accessor()[i]; - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L88-L95 - SubVector Range(const MatrixIndexT o, const MatrixIndexT l) { - return SubVector(*this, o, l); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L97-L105 - const SubVector Range(const MatrixIndexT o, const MatrixIndexT l) - const { - return SubVector(*this, o, l); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L107-L108 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L226-L233 - void CopyFromVec(const VectorBase& v) { - TORCH_INTERNAL_ASSERT(tensor_.sizes() == v.tensor_.sizes()); - tensor_.copy_(v.tensor_); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L137-L139 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L816-L832 - void ApplyFloor(Real floor_val, MatrixIndexT* floored_count = nullptr) { - auto index = tensor_ < floor_val; - auto tmp = tensor_.index_put_({index}, floor_val); - if (floored_count) { - *floored_count = index.sum().item().template to(); - } - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L164-L165 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L449-L479 - void ApplyPow(Real power) { - tensor_.pow_(power); - TORCH_INTERNAL_ASSERT(!tensor_.isnan().sum().item().template to()); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L181-L184 - template - void AddVec(const Real alpha, const VectorBase& v) { - tensor_ += alpha * v.tensor_; - } - - // 
https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L186-L187 - void AddVec2(const Real alpha, const VectorBase& v) { - tensor_ += alpha * (v.tensor_.square()); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L196-L198 - void AddMatVec( - const Real alpha, - const MatrixBase& M, - const MatrixTransposeType trans, - const VectorBase& v, - const Real beta) { // **beta previously defaulted to 0.0** - auto mat = M.tensor_; - if (trans == kTrans) { - mat = mat.transpose(1, 0); - } - tensor_.addmv_(mat, v.tensor_, beta, alpha); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L221-L222 - void MulElements(const VectorBase& v) { - tensor_ *= v.tensor_; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L233-L234 - void Add(Real c) { - tensor_ += c; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L236-L239 - void AddVecVec( - Real alpha, - const VectorBase& v, - const VectorBase& r, - Real beta) { - tensor_ = beta * tensor_ + alpha * v.tensor_ * r.tensor_; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L246-L247 - void Scale(Real alpha) { - tensor_ *= alpha; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L305-L306 - Real Min() const { - if (tensor_.numel()) { - return tensor_.min().item().template to(); - } - return std::numeric_limits::infinity(); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L308-L310 - Real Min(MatrixIndexT* index) const { - TORCH_INTERNAL_ASSERT(tensor_.numel()); - torch::Tensor value, ind; - std::tie(value, ind) = tensor_.min(0); - *index = ind.item().to(); - return value.item().to(); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L312-L313 - Real Sum() const { - return tensor_.sum().item().template to(); - }; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L320-L321 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L718-L736 - void AddRowSumMat(Real alpha, const MatrixBase& M, Real beta = 1.0) { - Vector ones(M.NumRows()); - ones.Set(1.0); - this->AddMatVec(alpha, M, kTrans, ones, beta); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L323-L324 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L738-L757 - void AddColSumMat(Real alpha, const MatrixBase& M, Real beta = 1.0) { - Vector ones(M.NumCols()); - ones.Set(1.0); - this->AddMatVec(alpha, M, kNoTrans, ones, beta); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L326-L330 - void AddDiagMat2( - Real alpha, - const MatrixBase& M, - MatrixTransposeType trans = kNoTrans, - Real beta = 1.0) { - auto mat = M.tensor_; - if (trans == kNoTrans) { - tensor_ = - beta * tensor_ + torch::diag(torch::mm(mat, mat.transpose(1, 0))); - } else { - tensor_ = - beta * tensor_ + 
torch::diag(torch::mm(mat.transpose(1, 0), mat)); - } - } - - protected: - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L362-L365 - explicit VectorBase(); - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L378-L379 - Real* data_; - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L382 - KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase); -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L385-L390 -template -class Vector : public VectorBase { - public: - //////////////////////////////////////////////////////////////////////////////// - // PyTorch-compatibility things - //////////////////////////////////////////////////////////////////////////////// - /// Construct VectorBase which is an interface to an existing torch::Tensor - /// object. - Vector(torch::Tensor tensor) : VectorBase(tensor){}; - - //////////////////////////////////////////////////////////////////////////////// - // Kaldi-compatible methods - //////////////////////////////////////////////////////////////////////////////// - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L392-L393 - Vector() : VectorBase(){}; - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L395-L399 - explicit Vector(const MatrixIndexT s, MatrixResizeType resize_type = kSetZero) - : VectorBase() { - Resize(s, resize_type); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L406-L410 - // Note: unlike the original implementation, this is "explicit". - explicit Vector(const Vector& v) - : VectorBase(v.tensor_.clone()) {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L412-L416 - explicit Vector(const VectorBase& v) - : VectorBase(v.tensor_.clone()) {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L434-L435 - void Swap(Vector* other) { - auto tmp = VectorBase::tensor_; - this->tensor_ = other->tensor_; - other->tensor_ = tmp; - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L444-L451 - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.cc#L189-L223 - void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero) { - auto& tensor_ = this->tensor_; - switch (resize_type) { - case kSetZero: - tensor_.resize_({length}).zero_(); - break; - case kUndefined: - tensor_.resize_({length}); - break; - case kCopyData: - auto tmp = tensor_; - auto tmp_numel = tensor_.numel(); - tensor_.resize_({length}).zero_(); - auto numel = Slice(length < tmp_numel ? 
length : tmp_numel); - tensor_.index_put_({numel}, tmp.index({numel})); - break; - } - // data_ptr() causes compiler error - this->data_ = static_cast(tensor_.data_ptr()); - } - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L463-L468 - Vector& operator=(const VectorBase& other) { - Resize(other.Dim(), kUndefined); - this->CopyFromVec(other); - return *this; - } -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L482-L485 -template -class SubVector : public VectorBase { - public: - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L487-L499 - SubVector( - const VectorBase& t, - const MatrixIndexT origin, - const MatrixIndexT length) - : VectorBase(t.tensor_.index({Slice(origin, origin + length)})) {} - - // https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L524-L528 - SubVector(const MatrixBase& matrix, MatrixIndexT row) - : VectorBase(matrix.tensor_.index({row})) {} -}; - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L540-L543 -template -std::ostream& operator<<(std::ostream& out, const VectorBase& v) { - out << v.tensor_; - return out; -} - -// https://github.com/kaldi-asr/kaldi/blob/7fb716aa0f56480af31514c7e362db5c9f787fd4/src/matrix/kaldi-vector.h#L573-L575 -template -Real VecVec(const VectorBase& v1, const VectorBase& v2) { - return torch::dot(v1.tensor_, v2.tensor_).item().template to(); -} - -} // namespace kaldi - -#endif diff --git a/third_party/kaldi/submodule b/third_party/kaldi/submodule deleted file mode 160000 index 3eea37dd09..0000000000 --- a/third_party/kaldi/submodule +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3eea37dd09b55064e6362216f7e9a60641f29f09 diff --git a/third_party/patches/sox.patch b/third_party/patches/sox.patch deleted file mode 100644 index fe8df945c0..0000000000 --- a/third_party/patches/sox.patch +++ /dev/null @@ -1,16 +0,0 @@ -See https://github.com/pytorch/audio/pull/1297 -diff -ru sox/src/formats.c sox/src/formats.c ---- sox/src/formats.c 2014-10-26 19:55:50.000000000 -0700 -+++ sox/src/formats.c 2021-02-22 16:01:02.833144070 -0800 -@@ -333,6 +333,10 @@ - assert(ft); - if (!ft->fp) - return sox_false; -- fstat(fileno((FILE*)ft->fp), &st); -+ int fd = fileno((FILE*)ft->fp); -+ if (fd < 0) -+ return sox_false; -+ if (fstat(fd, &st) < 0) -+ return sox_false; - return ((st.st_mode & S_IFMT) == S_IFREG); - } diff --git a/third_party/sox/CMakeLists.txt b/third_party/sox/CMakeLists.txt index 50e5cc9156..c4f5dd8931 100644 --- a/third_party/sox/CMakeLists.txt +++ b/third_party/sox/CMakeLists.txt @@ -75,7 +75,7 @@ ExternalProject_Add(flac URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/ - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs --disable-xmms-plugin DOWNLOAD_NO_PROGRESS ON LOG_DOWNLOAD ON LOG_UPDATE ON @@ -193,7 +193,7 @@ ExternalProject_Add(sox DOWNLOAD_DIR ${ARCHIVE_DIR} URL 
https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2 URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c - PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/ + PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/ CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS} BUILD_BYPRODUCTS ${SOX_LIBRARIES} DOWNLOAD_NO_PROGRESS ON diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 81e36d3804..685ccf05de 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -34,10 +34,9 @@ def _get_build(var, default=False): _BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True) -_BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True) _BUILD_RIR = _get_build("BUILD_RIR", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True) -_USE_FFMPEG = _get_build("USE_FFMPEG", False) +_USE_FFMPEG = _get_build("USE_FFMPEG", True) _USE_ROCM = _get_build("USE_ROCM", torch.backends.cuda.is_built() and torch.version.hip is not None) _USE_CUDA = _get_build("USE_CUDA", torch.backends.cuda.is_built() and torch.version.hip is None) _BUILD_ALIGN = _get_build("BUILD_ALIGN", True) @@ -116,7 +115,6 @@ def build_extension(self, ext): "-DCMAKE_VERBOSE_MAKEFILE=ON", f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", - f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}", f"-DBUILD_RIR:BOOL={'ON' if _BUILD_RIR else 'OFF'}", f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}", f"-DBUILD_ALIGN:BOOL={'ON' if _BUILD_ALIGN else 'OFF'}", diff --git a/torchaudio/_extension/__init__.py b/torchaudio/_extension/__init__.py index f65b982228..951e381ae2 100644 --- a/torchaudio/_extension/__init__.py +++ b/torchaudio/_extension/__init__.py @@ -14,12 +14,10 @@ # Builder uses it for debugging purpose, so we export it. # https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80 __all__ = [ - "fail_if_no_kaldi", "fail_if_no_sox", "fail_if_no_ffmpeg", "_check_cuda_version", "_IS_TORCHAUDIO_EXT_AVAILABLE", - "_IS_KALDI_AVAILABLE", "_IS_RIR_AVAILABLE", "_SOX_INITIALIZED", "_FFMPEG_INITIALIZED", @@ -34,11 +32,10 @@ # In case of an error, we do not catch the failure as it suggests there is something # wrong with the installation. _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio") -# Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually +# RIR features are implemented in _torchaudio extension, but they can be individually # turned on/off at build time. Available means that _torchaudio is loaded properly, and -# Kaldi or RIR features are found there. +# RIR features are found there. 
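The `_extension/__init__.py` hunk above drops the Kaldi availability flag and its guard while keeping the same pattern for the remaining features (RIR, SoX, FFmpeg): a module-level boolean probed once at import time, plus a decorator that is either a pass-through or raises with an install hint. A minimal sketch of that guard pattern, using hypothetical names (`_IS_FOO_AVAILABLE`, `fail_if_no_foo`) rather than the real module internals:

```python
import functools

_IS_FOO_AVAILABLE = False  # in the real module this is probed at import time


def no_op(func):
    # Feature present: leave the function untouched.
    return func


def fail_with_message(message):
    # Feature absent: replace the body with a loud, actionable error.
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(f"{func.__name__} {message}")
        return wrapped
    return decorator


# Decided once at import, exactly like the removed fail_if_no_kaldi guard.
fail_if_no_foo = (
    no_op
    if _IS_FOO_AVAILABLE
    else fail_with_message("requires the foo extension, but it is not compiled in.")
)


@fail_if_no_foo
def use_foo():
    ...
```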
_IS_RIR_AVAILABLE = False -_IS_KALDI_AVAILABLE = False _IS_ALIGN_AVAILABLE = False if _IS_TORCHAUDIO_EXT_AVAILABLE: _load_lib("libtorchaudio") @@ -47,7 +44,6 @@ _check_cuda_version() _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available() - _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available() _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available() @@ -77,13 +73,6 @@ _LG.debug("Failed to initialize ffmpeg bindings", exc_info=True) -fail_if_no_kaldi = ( - no_op - if _IS_KALDI_AVAILABLE - else fail_with_message( - "requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support." - ) -) fail_if_no_sox = ( no_op if _SOX_INITIALIZED diff --git a/torchaudio/_extension/utils.py b/torchaudio/_extension/utils.py index 5490385d34..30ef2e4a35 100644 --- a/torchaudio/_extension/utils.py +++ b/torchaudio/_extension/utils.py @@ -67,7 +67,7 @@ def _init_sox(): _load_lib("libtorchaudio_sox") import torchaudio.lib._torchaudio_sox # noqa - torch.ops.torchaudio.sox_utils_set_verbosity(0) + torchaudio.lib._torchaudio_sox.set_verbosity(0) import atexit diff --git a/torchaudio/_internal/module_utils.py b/torchaudio/_internal/module_utils.py index ed648e57f2..d5ab186b0c 100644 --- a/torchaudio/_internal/module_utils.py +++ b/torchaudio/_internal/module_utils.py @@ -40,22 +40,21 @@ def wrapped(*args, **kwargs): return decorator -def deprecated(direction: str, version: Optional[str] = None): +def deprecated(direction: str, version: Optional[str] = None, remove: bool = False): """Decorator to add deprecation message Args: direction (str): Migration steps to be given to users. version (str or int): The version when the object will be removed + remove (bool): If enabled, append future removal message. """ def decorator(func): @wraps(func) def wrapped(*args, **kwargs): - message = ( - f"{func.__module__}.{func.__name__} has been deprecated " - f'and will be removed from {"future" if version is None else version} release. ' - f"{direction}" - ) + message = f"{func.__module__}.{func.__name__} has been deprecated. {direction}" + if remove: + message += f' It will be removed from {"future" if version is None else version} release. 
' warnings.warn(message, stacklevel=2) return func(*args, **kwargs) diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index 8b540b5954..30b5cecfb0 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -1,10 +1,8 @@ import os -import warnings from typing import Optional, Tuple import torch import torchaudio -from torchaudio.utils.sox_utils import get_buffer_size from .common import AudioMetaData @@ -14,10 +12,6 @@ def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData: raise RuntimeError("Failed to fetch metadata from {}".format(filepath)) -def _fail_info_fileobj(fileobj, format: Optional[str], buffer_size: int) -> AudioMetaData: - raise RuntimeError("Failed to fetch metadata from {}".format(fileobj)) - - # Note: need to comply TorchScript syntax -- need annotation and no f-string def _fail_load( filepath: str, @@ -30,30 +24,14 @@ def _fail_load( raise RuntimeError("Failed to load audio from {}".format(filepath)) -def _fail_load_fileobj(fileobj, *args, **kwargs): - raise RuntimeError(f"Failed to load audio from {fileobj}") - - if torchaudio._extension._FFMPEG_INITIALIZED: import torchaudio.io._compat as _compat _fallback_info = _compat.info_audio - _fallback_info_fileobj = _compat.info_audio_fileobj _fallback_load = _compat.load_audio - _fallback_load_fileobj = _compat.load_audio_fileobj else: _fallback_info = _fail_info - _fallback_info_fileobj = _fail_info_fileobj _fallback_load = _fail_load - _fallback_load_fileobj = _fail_load_fileobj - - -_deprecation_message = ( - "File-like object support in sox_io backend is deprecated, " - "and will be removed in v2.1. " - "See https://github.com/pytorch/audio/issues/2950 for the detail." - "Please migrate to the new dispatcher, or use soundfile backend." -) @torchaudio._extension.fail_if_no_sox @@ -64,24 +42,8 @@ def info( """Get signal information of an audio file. Args: - filepath (path-like object or file-like object): - Source of audio data. When the function is not compiled by TorchScript, - (e.g. ``torch.jit.script``), the following types are accepted; - - * ``path-like``: file path - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - When the function is compiled by TorchScript, only ``str`` type is allowed. - - Note: - - * When the input type is file-like object, this function cannot - get the correct length (``num_samples``) for certain formats, - such as ``vorbis``. - In this case, the value of ``num_samples`` is ``0``. - * This argument is intentionally annotated as ``str`` only due to - TorchScript compiler compatibility. + filepath (str): + Source of audio data. format (str or None, optional): Override the format detection with the given format. @@ -93,21 +55,7 @@ def info( """ if not torch.jit.is_scripting(): if hasattr(filepath, "read"): - # Special case for Backward compatibility - # v0.11 -> v0.12, mp3 handling is moved to FFmpeg. - # file-like objects are not necessarily fallback-able - # when they are not seekable. - # The previous libsox-based implementation required `format="mp3"` - # because internally libsox does not auto-detect the format. - # For the special BC for mp3, we handle mp3 differently. 
- buffer_size = get_buffer_size() - if format == "mp3": - return _fallback_info_fileobj(filepath, format, buffer_size) - warnings.warn(_deprecation_message) - sinfo = torchaudio.lib._torchaudio_sox.get_info_fileobj(filepath, format) - if sinfo is not None: - return AudioMetaData(*sinfo) - return _fallback_info_fileobj(filepath, format, buffer_size) + raise RuntimeError("sox_io backend does not support file-like object.") filepath = os.fspath(filepath) sinfo = torch.ops.torchaudio.sox_io_get_info(filepath, format) if sinfo is not None: @@ -171,18 +119,7 @@ def load( For these formats, this function always returns ``float32`` Tensor with values. Args: - filepath (path-like object or file-like object): - Source of audio data. When the function is not compiled by TorchScript, - (e.g. ``torch.jit.script``), the following types are accepted; - - * ``path-like``: file path - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - When the function is compiled by TorchScript, only ``str`` type is allowed. - - Note: This argument is intentionally annotated as ``str`` only due to - TorchScript compiler compatibility. + filepath (path-like object): Source of audio data. frame_offset (int): Number of frames to skip before start reading data. num_frames (int, optional): @@ -214,39 +151,7 @@ def load( """ if not torch.jit.is_scripting(): if hasattr(filepath, "read"): - # Special case for Backward compatibility - # v0.11 -> v0.12, mp3 handling is moved to FFmpeg. - # file-like objects are not necessarily fallback-able - # when they are not seekable. - # The previous libsox-based implementation required `format="mp3"` - # because internally libsox does not auto-detect the format. - # For the special BC for mp3, we handle mp3 differently. - buffer_size = get_buffer_size() - if format == "mp3": - return _fallback_load_fileobj( - filepath, - frame_offset, - num_frames, - normalize, - channels_first, - format, - buffer_size, - ) - warnings.warn(_deprecation_message) - ret = torchaudio.lib._torchaudio_sox.load_audio_fileobj( - filepath, frame_offset, num_frames, normalize, channels_first, format - ) - if ret is not None: - return ret - return _fallback_load_fileobj( - filepath, - frame_offset, - num_frames, - normalize, - channels_first, - format, - buffer_size, - ) + raise RuntimeError("sox_io backend does not support file-like object.") filepath = os.fspath(filepath) ret = torch.ops.torchaudio.sox_io_load_audio_file( filepath, frame_offset, num_frames, normalize, channels_first, format @@ -270,9 +175,7 @@ def save( """Save audio data to file. Args: - filepath (str or pathlib.Path): Path to save file. - This function also handles ``pathlib.Path`` objects, but is annotated - as ``str`` for TorchScript compiler compatibility. + filepath (path-like object): Path to save file. src (torch.Tensor): Audio data to save. must be 2D tensor. 
sample_rate (int): sampling rate channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, @@ -413,18 +316,7 @@ def save( """ if not torch.jit.is_scripting(): if hasattr(filepath, "write"): - warnings.warn(_deprecation_message) - torchaudio.lib._torchaudio_sox.save_audio_fileobj( - filepath, - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) - return + raise RuntimeError("sox_io backend does not handle file-like object.") filepath = os.fspath(filepath) torch.ops.torchaudio.sox_io_save_audio_file( filepath, diff --git a/torchaudio/csrc/CMakeLists.txt b/torchaudio/csrc/CMakeLists.txt index 88c8736d01..fc0c549493 100644 --- a/torchaudio/csrc/CMakeLists.txt +++ b/torchaudio/csrc/CMakeLists.txt @@ -76,12 +76,6 @@ if(USE_CUDA) ) endif() -if(BUILD_KALDI) - list(APPEND additional_libs kaldi) - list(APPEND sources kaldi.cpp) - list(APPEND compile_definitions INCLUDE_KALDI) -endif() - if(OpenMP_CXX_FOUND) list( APPEND diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt index e3445265b5..8c08704bbc 100644 --- a/torchaudio/csrc/ffmpeg/CMakeLists.txt +++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt @@ -1,10 +1,3 @@ -message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}") -find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil) -add_library(ffmpeg INTERFACE) -target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}") -target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}") - - set( sources ffmpeg.cpp diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp index 66bd222c05..7822b30392 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp +++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp @@ -5,8 +5,7 @@ #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { //////////////////////////////////////////////////////////////////////////////// // AVDictionary @@ -147,5 +146,4 @@ void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) { AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p) : Wrapper(p) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp index 797f078349..1a1e40b011 100644 --- a/torchaudio/csrc/ffmpeg/filter_graph.cpp +++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp @@ -1,8 +1,7 @@ #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { AVFilterGraph* get_filter_graph() { @@ -222,5 +221,4 @@ int FilterGraph::get_frame(AVFrame* pOutputFrame) { return av_buffersink_get_frame(buffersink_ctx, pOutputFrame); } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp index 7ccc7bd0bf..95db01fcec 100644 --- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp +++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp @@ -3,8 +3,7 @@ #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { std::map> get_versions() { @@ -354,5 +353,4 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { } } // namespace -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h b/torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h index 889521fbf7..7afa760355 100644 --- 
a/torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h +++ b/torchaudio/csrc/ffmpeg/stream_reader/buffer/unchunked_buffer.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include #include diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp index 99e33e8367..406f4e91bf 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp @@ -398,15 +398,14 @@ torch::Tensor NV12Converter::convert(const AVFrame* src) { #ifdef USE_CUDA +CudaImageConverterBase::CudaImageConverterBase(const torch::Device& device) + : device(device) {} + //////////////////////////////////////////////////////////////////////////////// // NV12 CUDA //////////////////////////////////////////////////////////////////////////////// -NV12CudaConverter::NV12CudaConverter(int h, int w, const torch::Device& device) - : ImageConverterBase(h, w, 3), - tmp_uv(get_image_buffer( - {1, height / 2, width / 2, 2}, - device, - torch::kUInt8)) { +NV12CudaConverter::NV12CudaConverter(const torch::Device& device) + : CudaImageConverterBase(device) { TORCH_WARN_ONCE( "The output format NV12 is selected. " "This will be implicitly converted to YUV444P, " @@ -469,8 +468,16 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { } torch::Tensor NV12CudaConverter::convert(const AVFrame* src) { - torch::Tensor buffer = - get_image_buffer({1, num_channels, height, width}, tmp_uv.device()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); + if (!init) { + height = src->height; + width = src->width; + tmp_uv = + get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kUInt8); + init = true; + } + + torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); convert(src, buffer); return buffer; } @@ -478,12 +485,8 @@ torch::Tensor NV12CudaConverter::convert(const AVFrame* src) { //////////////////////////////////////////////////////////////////////////////// // P010 CUDA //////////////////////////////////////////////////////////////////////////////// -P010CudaConverter::P010CudaConverter(int h, int w, const torch::Device& device) - : ImageConverterBase(h, w, 3), - tmp_uv(get_image_buffer( - {1, height / 2, width / 2, 2}, - device, - torch::kInt16)) { +P010CudaConverter::P010CudaConverter(const torch::Device& device) + : CudaImageConverterBase{device} { TORCH_WARN_ONCE( "The output format P010 is selected. 
" "This will be implicitly converted to YUV444P, " @@ -550,8 +553,17 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { } torch::Tensor P010CudaConverter::convert(const AVFrame* src) { - torch::Tensor buffer = get_image_buffer( - {1, num_channels, height, width}, tmp_uv.device(), torch::kInt16); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); + if (!init) { + height = src->height; + width = src->width; + tmp_uv = + get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kInt16); + init = true; + } + + torch::Tensor buffer = + get_image_buffer({1, 3, height, width}, device, torch::kInt16); convert(src, buffer); return buffer; } @@ -559,11 +571,8 @@ torch::Tensor P010CudaConverter::convert(const AVFrame* src) { //////////////////////////////////////////////////////////////////////////////// // YUV444P CUDA //////////////////////////////////////////////////////////////////////////////// -YUV444PCudaConverter::YUV444PCudaConverter( - int h, - int w, - const torch::Device& device) - : ImageConverterBase(h, w, 3), device(device) {} +YUV444PCudaConverter::YUV444PCudaConverter(const torch::Device& device) + : CudaImageConverterBase(device) {} void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); @@ -588,7 +597,7 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly - for (int i = 0; i < num_channels; ++i) { + for (int i = 0; i < 3; ++i) { auto status = cudaMemcpy2D( dst.index({0, i}).data_ptr(), width, @@ -603,8 +612,13 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { } torch::Tensor YUV444PCudaConverter::convert(const AVFrame* src) { - torch::Tensor buffer = - get_image_buffer({1, num_channels, height, width}, device); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); + if (!init) { + height = src->height; + width = src->width; + init = true; + } + torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); convert(src, buffer); return buffer; } diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.h b/torchaudio/csrc/ffmpeg/stream_reader/conversion.h index d128a2f261..0c178ce500 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.h +++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.h @@ -87,29 +87,40 @@ class NV12Converter : public ImageConverterBase { #ifdef USE_CUDA -class NV12CudaConverter : ImageConverterBase { - torch::Tensor tmp_uv; +// Note: +// GPU decoders are tricky. They allow to change the resolution as part of +// decoder option, and the resulting resolution is (seemingly) not retrievable. +// Therefore, we adopt delayed frame size initialization. +// For that purpose, we do not inherit from ImageConverterBase. 
+struct CudaImageConverterBase { + const torch::Device device; + bool init = false; + int height = -1; + int width = -1; + explicit CudaImageConverterBase(const torch::Device& device); +}; + +class NV12CudaConverter : CudaImageConverterBase { + torch::Tensor tmp_uv{}; public: - NV12CudaConverter(int height, int width, const torch::Device& device); + explicit NV12CudaConverter(const torch::Device& device); void convert(const AVFrame* src, torch::Tensor& dst); torch::Tensor convert(const AVFrame* src); }; -class P010CudaConverter : ImageConverterBase { - torch::Tensor tmp_uv; +class P010CudaConverter : CudaImageConverterBase { + torch::Tensor tmp_uv{}; public: - P010CudaConverter(int height, int width, const torch::Device& device); + explicit P010CudaConverter(const torch::Device& device); void convert(const AVFrame* src, torch::Tensor& dst); torch::Tensor convert(const AVFrame* src); }; -class YUV444PCudaConverter : ImageConverterBase { - const torch::Device device; - +class YUV444PCudaConverter : CudaImageConverterBase { public: - YUV444PCudaConverter(int height, int width, const torch::Device& device); + explicit YUV444PCudaConverter(const torch::Device& device); void convert(const AVFrame* src, torch::Tensor& dst); torch::Tensor convert(const AVFrame* src); }; diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp index 8caec7cb58..bcff81dc3b 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp @@ -1,7 +1,6 @@ #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { void PacketBuffer::push_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); AVPacket* p = av_packet_clone(packet); @@ -18,5 +17,4 @@ std::vector PacketBuffer::pop_packets() { bool PacketBuffer::has_packets() { return packets.size() > 0; } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp index 147d0bc2d5..38440e3e33 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp @@ -400,17 +400,17 @@ std::unique_ptr get_unchunked_cuda_video_process( case AV_PIX_FMT_NV12: { using C = NV12CudaConverter; return std::make_unique>( - std::move(filter), C{i.height, i.width, device}, B{i.time_base}); + std::move(filter), C{device}, B{i.time_base}); } case AV_PIX_FMT_P010: { using C = P010CudaConverter; return std::make_unique>( - std::move(filter), C{i.height, i.width, device}, B{i.time_base}); + std::move(filter), C{device}, B{i.time_base}); } case AV_PIX_FMT_YUV444P: { using C = YUV444PCudaConverter; return std::make_unique>( - std::move(filter), C{i.height, i.width, device}, B{i.time_base}); + std::move(filter), C{device}, B{i.time_base}); } case AV_PIX_FMT_P016: { TORCH_CHECK( @@ -519,21 +519,21 @@ std::unique_ptr get_chunked_cuda_video_process( using C = NV12CudaConverter; return std::make_unique>( std::move(filter), - C{i.height, i.width, device}, + C{device}, B{i.time_base, frames_per_chunk, num_chunks}); } case AV_PIX_FMT_P010: { using C = P010CudaConverter; return std::make_unique>( std::move(filter), - C{i.height, i.width, device}, + C{device}, B{i.time_base, frames_per_chunk, num_chunks}); } case AV_PIX_FMT_YUV444P: { using C = YUV444PCudaConverter; return std::make_unique>( std::move(filter), - C{i.height, 
i.width, device}, + C{device}, B{i.time_base, frames_per_chunk, num_chunks}); } case AV_PIX_FMT_P016: { diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp index a0bf22a065..2213a4018a 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp @@ -3,8 +3,7 @@ #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { namespace { AVCodecContextPtr alloc_codec_context( @@ -389,5 +388,4 @@ c10::optional StreamProcessor::pop_chunk(KeyType key) { return post_processes.at(key)->pop_chunk(); } -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h index 22a281ed4f..5ca3bad073 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp index 0eec327aa5..b8e9d7a9bf 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp @@ -5,8 +5,7 @@ #include #include -namespace torchaudio { -namespace io { +namespace torchaudio::io { using KeyType = StreamProcessor::KeyType; @@ -607,5 +606,4 @@ StreamReaderCustomIO::StreamReaderCustomIO( : CustomInput(opaque, buffer_size, read_packet, seek), StreamReader(io_ctx, format, option) {} -} // namespace io -} // namespace torchaudio +} // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h index 4d509977d7..56123b1048 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h @@ -359,16 +359,23 @@ struct CustomInput { /// @endcond -/// Construct StreamReader with custom read and seek functions. /// -/// @param opaque Custom data used by read_packet and seek functions. -/// @param format Specify input format. -/// @param buffer_size The size of the intermediate buffer, which FFmpeg uses to -/// pass data to function read_packet. -/// @param read_packet Custom read function that is called from FFmpeg to -/// read data from the destination. -/// @param seek Optional seek function that is used to seek the destination. -struct StreamReaderCustomIO : private detail::CustomInput, public StreamReader { +/// A subclass of StreamReader which works with custom read function. +/// Can be used for decoding media from memory or custom object. +/// +class StreamReaderCustomIO : private detail::CustomInput, public StreamReader { + public: + /// + /// Construct StreamReader with custom read and seek functions. + /// + /// @param opaque Custom data used by ``read_packet`` and ``seek`` functions. + /// @param format Specify input format. + /// @param buffer_size The size of the intermediate buffer, which FFmpeg uses + /// to pass data to function read_packet. + /// @param read_packet Custom read function that is called from FFmpeg to + /// read data from the destination. + /// @param seek Optional seek function that is used to seek the destination. + /// @param option Custom option passed when initializing format context. 
StreamReaderCustomIO( void* opaque, const c10::optional& format, diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp index 7ed0c9b1ae..44bd811e1b 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp @@ -278,6 +278,20 @@ int get_enc_sr( int src_sample_rate, const c10::optional& encoder_sample_rate, const AVCodec* codec) { + // G.722 only supports 16000 Hz, but it does not list the sample rate in + // supported_samplerates so we hard code it here. + if (codec->id == AV_CODEC_ID_ADPCM_G722) { + if (encoder_sample_rate) { + auto val = encoder_sample_rate.value(); + TORCH_CHECK( + val == 16'000, + codec->name, + " does not support sample rate ", + val, + ". Supported values are; 16000."); + } + return 16'000; + } if (encoder_sample_rate) { const int& encoder_sr = encoder_sample_rate.value(); TORCH_CHECK( @@ -664,7 +678,12 @@ FilterGraph get_video_filter_graph( FilterGraph f; f.add_video_src( - src_fmt, av_inv_q(src_rate), src_rate, src_width, src_height, {1, 1}); + is_cuda ? AV_PIX_FMT_CUDA : src_fmt, + av_inv_q(src_rate), + src_rate, + src_width, + src_height, + {1, 1}); f.add_video_sink(); f.add_process(desc); f.create_filter(); diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h index a3cbec9317..db0b1a74a8 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h +++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h @@ -306,17 +306,21 @@ struct CustomOutput { /// @endcond -/// Construct StreamWriter with custom write and seek functions. /// -/// @param opaque Custom data used by write_packet and seek functions. -/// @param format Specify output format. -/// @param buffer_size The size of the intermediate buffer, which FFmpeg uses to -/// pass data to write_packet function. -/// @param write_packet Custom write function that is called from FFmpeg to -/// actually write data to the custom destination. -/// @param seek Optional seek function that is used to seek the destination. -struct StreamWriterCustomIO : private detail::CustomOutput, - public StreamWriter { +/// A subclass of StreamReader which works with custom read function. +/// Can be used for encoding media into memory or custom object. +/// +class StreamWriterCustomIO : private detail::CustomOutput, public StreamWriter { + public: + /// Construct StreamWriterCustomIO with custom write and seek functions. + /// + /// @param opaque Custom data used by ``write_packet`` and ``seek`` functions. + /// @param format Specify output format. + /// @param buffer_size The size of the intermediate buffer, which FFmpeg uses + /// to pass data to write_packet function. + /// @param write_packet Custom write function that is called from FFmpeg to + /// actually write data to the custom destination. + /// @param seek Optional seek function that is used to seek the destination. 
StreamWriterCustomIO( void* opaque, const c10::optional& format, diff --git a/torchaudio/csrc/forced_align/cpu/compute.cpp b/torchaudio/csrc/forced_align/cpu/compute.cpp index da42cf942c..d9f735af47 100644 --- a/torchaudio/csrc/forced_align/cpu/compute.cpp +++ b/torchaudio/csrc/forced_align/cpu/compute.cpp @@ -17,8 +17,10 @@ void forced_align_impl( const scalar_t kNegInfinity = -std::numeric_limits::infinity(); using target_t = typename std:: conditional::type; - const auto T = logProbs.size(0); - const auto L = targets.size(0); + const auto batchIndex = + 0; // TODO: support batch version and use the real batch index + const auto T = logProbs.size(1); + const auto L = targets.size(1); const auto S = 2 * L + 1; torch::Tensor alphas = torch::empty( {2, S}, @@ -27,14 +29,14 @@ void forced_align_impl( .dtype(logProbs.dtype())) .fill_(kNegInfinity); torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); - auto logProbs_a = logProbs.accessor(); - auto targets_a = targets.accessor(); - auto paths_a = paths.accessor(); + auto logProbs_a = logProbs.accessor(); + auto targets_a = targets.accessor(); + auto paths_a = paths.accessor(); auto alphas_a = alphas.accessor(); auto backPtr_a = backPtr.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { - if (targets_a[i] == targets_a[i - 1]) { + if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) { ++R; } } @@ -49,20 +51,22 @@ void forced_align_impl( auto start = T - (L + R) > 0 ? 0 : 1; auto end = (S == 1) ? 1 : 2; for (auto i = start; i < end; i++) { - auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - alphas_a[0][i] = logProbs_a[0][labelIdx]; + auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; + alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx]; } for (auto t = 1; t < T; t++) { if (T - t <= L + R) { if ((start % 2 == 1) && - targets_a[start / 2] != targets_a[start / 2 + 1]) { + targets_a[batchIndex][start / 2] != + targets_a[batchIndex][start / 2 + 1]) { start = start + 1; } start = start + 1; } if (t <= L + R) { if (end % 2 == 0 && end < 2 * L && - targets_a[end / 2 - 1] != targets_a[end / 2]) { + targets_a[batchIndex][end / 2 - 1] != + targets_a[batchIndex][end / 2]) { end = end + 1; } end = end + 1; @@ -75,7 +79,7 @@ void forced_align_impl( } if (start == 0) { alphas_a[curIdxOffset][0] = - alphas_a[prevIdxOffset][0] + logProbs_a[t][blank]; + alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; backPtr_a[t][0] = 0; startloop += 1; } @@ -85,13 +89,14 @@ void forced_align_impl( auto x1 = alphas_a[prevIdxOffset][i - 1]; auto x2 = -std::numeric_limits::infinity(); - auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; + auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; // In CTC, the optimal path may optionally chose to skip a blank label. 
// x2 represents skipping a letter, and can only happen if we're not // currently on a blank_label, and we're not on a repeat letter // (i != 1) just ensures we don't access targets[i - 2] if its i < 2 - if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) { + if (i % 2 != 0 && i != 1 && + targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; @@ -105,7 +110,7 @@ void forced_align_impl( result = x0; backPtr_a[t][i] = 0; } - alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx]; + alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } } auto idx1 = (T - 1) % 2; @@ -113,8 +118,8 @@ void forced_align_impl( // path stores the token index for each time step after force alignment. auto indexScores = 0; for (auto t = T - 1; t > -1; t--) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[ltrIdx / 2]; - paths_a[t] = lbl_idx; + auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + paths_a[batchIndex][t] = lbl_idx; ++indexScores; ltrIdx -= backPtr_a[t][ltrIdx]; } @@ -142,30 +147,35 @@ std::tuple compute( TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); TORCH_CHECK( - logProbs.dim() != 3, - "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.") + logProbs.dim() == 3, + "log_probs must be 3-D (batch_size, input length, num classes)"); TORCH_CHECK( - targets.dim() != 2, - "2-D tensor is not yet supported for targets, please provide 1-D tensor.") + targets.dim() == 2, "targets must be 2-D (batch_size, target length,)"); TORCH_CHECK( - logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)"); - TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)"); - TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D"); - TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D"); + inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + logProbs.size(0) == 1, + "The batch dimension for log_probs must be 1 at the current version.") + TORCH_CHECK( + targets.size(0) == 1, + "The batch dimension for targets must be 1 at the current version.") TORCH_CHECK( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); TORCH_CHECK( - logProbs.size(0) == at::max(inputLengths).item().toInt(), + logProbs.size(1) == at::max(inputLengths).item().toInt(), "input length mismatch"); TORCH_CHECK( - targets.size(0) == at::max(targetLengths).item().toInt(), + targets.size(1) == at::max(targetLengths).item().toInt(), "target length mismatch"); - const auto T = logProbs.size(0); + const auto B = logProbs.size(0); + const auto T = logProbs.size(1); auto paths = torch::zeros( - {T}, + {B, T}, torch::TensorOptions().device(targets.device()).dtype(targets.dtype())); AT_DISPATCH_FLOATING_TYPES_AND_HALF( logProbs.scalar_type(), "forced_align_impl", [&] { @@ -180,9 +190,10 @@ std::tuple compute( return std::make_tuple( paths, logProbs.index( - {torch::linspace( + {torch::indexing::Slice(), + torch::linspace( 0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())), - paths})); + paths.index({0})})); } TORCH_LIBRARY_IMPL(torchaudio, CPU, m) { diff --git a/torchaudio/csrc/forced_align/gpu/compute.cu b/torchaudio/csrc/forced_align/gpu/compute.cu index d869473831..b23d52f1f3 100644 --- 
a/torchaudio/csrc/forced_align/gpu/compute.cu +++ b/torchaudio/csrc/forced_align/gpu/compute.cu @@ -18,9 +18,9 @@ namespace alignment { namespace gpu { template __global__ void falign_cuda_step_kernel( - const torch::PackedTensorAccessor32 + const torch::PackedTensorAccessor32 logProbs_a, - const torch::PackedTensorAccessor32 + const torch::PackedTensorAccessor32 targets_a, const int T, const int L, @@ -36,6 +36,8 @@ __global__ void falign_cuda_step_kernel( torch::PackedTensorAccessor32 backPtrBuffer_a) { scalar_t kNegInfinity = -std::numeric_limits::infinity(); + const int batchIndex = + 0; // TODO: support batch version and use the real batch index int S = 2 * L + 1; int curIdxOffset = (t % 2); // current time step frame for alpha int prevIdxOffset = ((t - 1) % 2); // previous time step frame for alpha @@ -49,8 +51,8 @@ __global__ void falign_cuda_step_kernel( __syncthreads(); if (t == 0) { for (unsigned int i = start + threadIdx.x; i < end; i += blockDim.x) { - int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - alphas_a[curIdxOffset][i] = logProbs_a[0][labelIdx]; + int labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; + alphas_a[curIdxOffset][i] = logProbs_a[batchIndex][0][labelIdx]; } return; } @@ -62,7 +64,7 @@ __global__ void falign_cuda_step_kernel( threadMax = kNegInfinity; if (start == 0 && threadIdx.x == 0) { alphas_a[curIdxOffset][0] = - alphas_a[prevIdxOffset][0] + logProbs_a[t][blank]; + alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; threadMax = max(threadMax, alphas_a[curIdxOffset][0]); backPtrBuffer_a[backPtrBufferLen][0] = 0; } @@ -73,8 +75,9 @@ __global__ void falign_cuda_step_kernel( scalar_t x0 = alphas_a[prevIdxOffset][i]; scalar_t x1 = alphas_a[prevIdxOffset][i - 1]; scalar_t x2 = kNegInfinity; - int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) { + int labelIdx = (i % 2 == 0) ? 
blank : targets_a[batchIndex][i / 2]; + if (i % 2 != 0 && i != 1 && + targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; @@ -88,7 +91,7 @@ __global__ void falign_cuda_step_kernel( result = x0; backPtrBuffer_a[backPtrBufferLen][i] = 0; } - alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx]; + alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; threadMax = max(threadMax, alphas_a[curIdxOffset][i]); } scalar_t maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max()); @@ -113,10 +116,12 @@ void forced_align_impl( const scalar_t kNegInfinity = -std::numeric_limits::infinity(); using target_t = typename std:: conditional::type; - auto paths_a = paths.accessor(); - const int T = logProbs.size(0); // num frames - const int N = logProbs.size(1); // alphabet size - const int L = targets.size(0); // label length + auto paths_a = paths.accessor(); + const int batchIndex = + 0; // TODO: support batch version and use the real batch index + const int T = logProbs.size(1); // num frames + const int N = logProbs.size(2); // alphabet size + const int L = targets.size(1); // label length const int S = 2 * L + 1; auto targetsCpu = targets.to(torch::kCPU); // backPtrBuffer stores the index offset fthe best path at current position @@ -144,12 +149,12 @@ void forced_align_impl( .device(logProbs.device())) .fill_(kNegInfinity); // CPU accessors - auto targetsCpu_a = targetsCpu.accessor(); + auto targetsCpu_a = targetsCpu.accessor(); auto backPtrCpu_a = backPtrCpu.accessor(); // count the number of repeats in label int R = 0; for (int i = 1; i < L; ++i) { - if (targetsCpu_a[i] == targetsCpu_a[i - 1]) { + if (targetsCpu_a[batchIndex][i] == targetsCpu_a[batchIndex][i - 1]) { ++R; } } @@ -169,14 +174,16 @@ void forced_align_impl( if (t > 0) { if (T - t <= L + R) { if ((start % 2 == 1) && - (targetsCpu_a[start / 2] != targetsCpu_a[start / 2 + 1])) { + (targetsCpu_a[batchIndex][start / 2] != + targetsCpu_a[batchIndex][start / 2 + 1])) { start = start + 1; } start = start + 1; } if (t <= L + R) { if ((end % 2 == 0) && (end < 2 * L) && - (targetsCpu_a[end / 2 - 1] != targetsCpu_a[end / 2])) { + (targetsCpu_a[batchIndex][end / 2 - 1] != + targetsCpu_a[batchIndex][end / 2])) { end = end + 1; } end = end + 1; @@ -184,8 +191,8 @@ void forced_align_impl( } falign_cuda_step_kernel <<<1, kNumThreads, 0, defaultStream>>>( - logProbs.packed_accessor32(), - targets.packed_accessor32(), + logProbs.packed_accessor32(), + targets.packed_accessor32(), T, L, N, @@ -229,8 +236,9 @@ void forced_align_impl( : S - 2; int indexScores = 0; for (int t = T - 1; t >= 0; --t) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targetsCpu_a[ltrIdx / 2]; - paths_a[t] = lbl_idx; + auto lbl_idx = + ltrIdx % 2 == 0 ? 
blank : targetsCpu_a[batchIndex][ltrIdx / 2]; + paths_a[batchIndex][t] = lbl_idx; ++indexScores; ltrIdx -= backPtrCpu_a[t][ltrIdx]; } @@ -258,30 +266,36 @@ std::tuple compute( TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); TORCH_CHECK( - logProbs.dim() != 3, - "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.") + logProbs.dim() == 3, + "log_probs must be 3-D (batch_size, input length, num classes)"); TORCH_CHECK( - targets.dim() != 2, - "2-D tensor is not yet supported for targets, please provide 1-D tensor.") + targets.dim() == 2, "targets must be 2-D (batch_size, target length,)"); TORCH_CHECK( - logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)"); - TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)"); - TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D"); - TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D"); + inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + logProbs.size(0) == 1, + "The batch dimension for log_probs must be 1 at the current version.") + TORCH_CHECK( + targets.size(0) == 1, + "The batch dimension for targets must be 1 at the current version.") TORCH_CHECK( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); TORCH_CHECK( - logProbs.size(0) == at::max(inputLengths).item().toInt(), + logProbs.size(1) == at::max(inputLengths).item().toInt(), "input length mismatch"); TORCH_CHECK( - targets.size(0) == at::max(targetLengths).item().toInt(), + targets.size(1) == at::max(targetLengths).item().toInt(), "target length mismatch"); - auto T = logProbs.size(0); // num frames + auto B = logProbs.size(0); + auto T = logProbs.size(1); // num frames auto paths = torch::zeros( - {T}, torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype())); + {B, T}, + torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype())); AT_DISPATCH_FLOATING_TYPES_AND_HALF( logProbs.scalar_type(), "forced_align_impl", [&] { if (targets.scalar_type() == torch::kInt64) { @@ -295,9 +309,10 @@ std::tuple compute( return std::make_tuple( paths.to(logProbs.device()), logProbs.index( - {torch::linspace( + {torch::indexing::Slice(), + torch::linspace( 0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())), - paths})); + paths.index({0})})); } TORCH_LIBRARY_IMPL(torchaudio, CUDA, m) { diff --git a/torchaudio/csrc/iir_cuda.cu b/torchaudio/csrc/iir_cuda.cu index be6512f2df..2f6b75b239 100644 --- a/torchaudio/csrc/iir_cuda.cu +++ b/torchaudio/csrc/iir_cuda.cu @@ -1,4 +1,5 @@ #include +#include #include template @@ -58,6 +59,8 @@ void cuda_lfilter_core_loop( TORCH_CHECK(in.size(2) + a_flipped.size(1) - 1 == padded_out.size(2)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(in)); + const dim3 threads(256); const dim3 blocks((N * C + threads.x - 1) / threads.x); diff --git a/torchaudio/csrc/kaldi.cpp b/torchaudio/csrc/kaldi.cpp deleted file mode 100644 index 6f2b36c28f..0000000000 --- a/torchaudio/csrc/kaldi.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include -#include "feat/pitch-functions.h" - -namespace torchaudio { -namespace kaldi { - -namespace { - -torch::Tensor denormalize(const torch::Tensor& t) { - auto ret = t; - auto pos = t > 0, neg = t < 0; - ret.index_put({pos}, t.index({pos}) * 32767); - ret.index_put({neg}, t.index({neg}) * 32768); - return ret; 
-} - -torch::Tensor compute_kaldi_pitch( - const torch::Tensor& wave, - const ::kaldi::PitchExtractionOptions& opts) { - ::kaldi::VectorBase<::kaldi::BaseFloat> input(wave); - ::kaldi::Matrix<::kaldi::BaseFloat> output; - ::kaldi::ComputeKaldiPitch(opts, input, &output); - return output.tensor_; -} - -} // namespace - -torch::Tensor ComputeKaldiPitch( - const torch::Tensor& wave, - double sample_frequency, - double frame_length, - double frame_shift, - double min_f0, - double max_f0, - double soft_min_f0, - double penalty_factor, - double lowpass_cutoff, - double resample_frequency, - double delta_pitch, - double nccf_ballast, - int64_t lowpass_filter_width, - int64_t upsample_filter_width, - int64_t max_frames_latency, - int64_t frames_per_chunk, - bool simulate_first_pass_online, - int64_t recompute_frame, - bool snip_edges) { - TORCH_CHECK(wave.ndimension() == 2, "Input tensor must be 2 dimentional."); - TORCH_CHECK(wave.device().is_cpu(), "Input tensor must be on CPU."); - TORCH_CHECK( - wave.dtype() == torch::kFloat32, "Input tensor must be float32 type."); - - ::kaldi::PitchExtractionOptions opts; - opts.samp_freq = static_cast<::kaldi::BaseFloat>(sample_frequency); - opts.frame_shift_ms = static_cast<::kaldi::BaseFloat>(frame_shift); - opts.frame_length_ms = static_cast<::kaldi::BaseFloat>(frame_length); - opts.min_f0 = static_cast<::kaldi::BaseFloat>(min_f0); - opts.max_f0 = static_cast<::kaldi::BaseFloat>(max_f0); - opts.soft_min_f0 = static_cast<::kaldi::BaseFloat>(soft_min_f0); - opts.penalty_factor = static_cast<::kaldi::BaseFloat>(penalty_factor); - opts.lowpass_cutoff = static_cast<::kaldi::BaseFloat>(lowpass_cutoff); - opts.resample_freq = static_cast<::kaldi::BaseFloat>(resample_frequency); - opts.delta_pitch = static_cast<::kaldi::BaseFloat>(delta_pitch); - opts.lowpass_filter_width = static_cast<::kaldi::int32>(lowpass_filter_width); - opts.upsample_filter_width = - static_cast<::kaldi::int32>(upsample_filter_width); - opts.max_frames_latency = static_cast<::kaldi::int32>(max_frames_latency); - opts.frames_per_chunk = static_cast<::kaldi::int32>(frames_per_chunk); - opts.simulate_first_pass_online = simulate_first_pass_online; - opts.recompute_frame = static_cast<::kaldi::int32>(recompute_frame); - opts.snip_edges = snip_edges; - - // Kaldi's float type expects value range of int16 expressed as float - torch::Tensor wave_ = denormalize(wave); - - auto batch_size = wave_.size(0); - std::vector results(batch_size); - at::parallel_for(0, batch_size, 1, [&](int64_t begin, int64_t end) { - for (auto i = begin; i < end; ++i) { - results[i] = compute_kaldi_pitch(wave_.index({i}), opts); - } - }); - return torch::stack(results, 0); -} - -TORCH_LIBRARY_FRAGMENT(torchaudio, m) { - m.def( - "torchaudio::kaldi_ComputeKaldiPitch", - &torchaudio::kaldi::ComputeKaldiPitch); -} - -} // namespace kaldi -} // namespace torchaudio diff --git a/torchaudio/csrc/pybind/pybind.cpp b/torchaudio/csrc/pybind/pybind.cpp index 9d2d0e35a2..b956deb0e4 100644 --- a/torchaudio/csrc/pybind/pybind.cpp +++ b/torchaudio/csrc/pybind/pybind.cpp @@ -5,7 +5,6 @@ namespace torchaudio { namespace { PYBIND11_MODULE(_torchaudio, m) { - m.def("is_kaldi_available", &is_kaldi_available, ""); m.def("is_rir_available", &is_rir_available, ""); m.def("is_align_available", &is_align_available, ""); m.def("cuda_version", &cuda_version, ""); diff --git a/torchaudio/csrc/sox/CMakeLists.txt b/torchaudio/csrc/sox/CMakeLists.txt index e369ecf7af..3391a4fc37 100644 --- a/torchaudio/csrc/sox/CMakeLists.txt +++ 
b/torchaudio/csrc/sox/CMakeLists.txt @@ -15,17 +15,9 @@ torchaudio_library( ) if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) - set( - ext_sources - pybind/pybind.cpp - pybind/effects.cpp - pybind/effects_chain.cpp - pybind/io.cpp - pybind/utils.cpp - ) torchaudio_extension( _torchaudio_sox - "${ext_sources}" + "pybind/pybind.cpp;" "" "libtorchaudio_sox" "" diff --git a/torchaudio/csrc/sox/effects.cpp b/torchaudio/csrc/sox/effects.cpp index 9232d870b1..a159663a10 100644 --- a/torchaudio/csrc/sox/effects.cpp +++ b/torchaudio/csrc/sox/effects.cpp @@ -3,11 +3,7 @@ #include #include -using namespace torchaudio::sox_utils; - -namespace torchaudio { -namespace sox_effects { - +namespace torchaudio::sox { namespace { enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown }; @@ -58,7 +54,7 @@ auto apply_effects_tensor( // Create SoxEffectsChain const auto dtype = waveform.dtype(); - torchaudio::sox_effects_chain::SoxEffectsChain chain( + SoxEffectsChain chain( /*input_encoding=*/get_tensor_encodinginfo(dtype), /*output_encoding=*/get_tensor_encodinginfo(dtype)); @@ -113,7 +109,7 @@ auto apply_effects_file( out_buffer.reserve(sf->signal.length); // Create and run SoxEffectsChain - torchaudio::sox_effects_chain::SoxEffectsChain chain( + SoxEffectsChain chain( /*input_encoding=*/sf->encoding, /*output_encoding=*/get_tensor_encodinginfo(dtype)); @@ -138,20 +134,16 @@ auto apply_effects_file( tensor, chain.getOutputSampleRate()); } +namespace { + TORCH_LIBRARY_FRAGMENT(torchaudio, m) { m.def( "torchaudio::sox_effects_initialize_sox_effects", - &torchaudio::sox_effects::initialize_sox_effects); - m.def( - "torchaudio::sox_effects_shutdown_sox_effects", - &torchaudio::sox_effects::shutdown_sox_effects); - m.def( - "torchaudio::sox_effects_apply_effects_tensor", - &torchaudio::sox_effects::apply_effects_tensor); - m.def( - "torchaudio::sox_effects_apply_effects_file", - &torchaudio::sox_effects::apply_effects_file); + &initialize_sox_effects); + m.def("torchaudio::sox_effects_shutdown_sox_effects", &shutdown_sox_effects); + m.def("torchaudio::sox_effects_apply_effects_tensor", &apply_effects_tensor); + m.def("torchaudio::sox_effects_apply_effects_file", &apply_effects_file); } -} // namespace sox_effects -} // namespace torchaudio +} // namespace +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/effects.h b/torchaudio/csrc/sox/effects.h index bac088ee18..70e59f887f 100644 --- a/torchaudio/csrc/sox/effects.h +++ b/torchaudio/csrc/sox/effects.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_effects { +namespace torchaudio::sox { void initialize_sox_effects(); @@ -25,7 +24,6 @@ auto apply_effects_file( const c10::optional& format) -> c10::optional>; -} // namespace sox_effects -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/effects_chain.cpp b/torchaudio/csrc/sox/effects_chain.cpp index 8d8fbcc829..81dddada28 100644 --- a/torchaudio/csrc/sox/effects_chain.cpp +++ b/torchaudio/csrc/sox/effects_chain.cpp @@ -3,10 +3,8 @@ #include "c10/util/Exception.h" using namespace torch::indexing; -using namespace torchaudio::sox_utils; -namespace torchaudio { -namespace sox_effects_chain { +namespace torchaudio::sox { namespace { @@ -300,5 +298,4 @@ int64_t SoxEffectsChain::getOutputSampleRate() { return interm_sig_.rate; } -} // namespace sox_effects_chain -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/effects_chain.h b/torchaudio/csrc/sox/effects_chain.h index 
c456276ef0..7245447738 100644 --- a/torchaudio/csrc/sox/effects_chain.h +++ b/torchaudio/csrc/sox/effects_chain.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_effects_chain { +namespace torchaudio::sox { // Helper struct to safely close sox_effect_t* pointer returned by // sox_create_effect @@ -57,7 +56,6 @@ class SoxEffectsChain { int64_t getOutputSampleRate(); }; -} // namespace sox_effects_chain -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/io.cpp b/torchaudio/csrc/sox/io.cpp index dd4951ea7c..b8aac89372 100644 --- a/torchaudio/csrc/sox/io.cpp +++ b/torchaudio/csrc/sox/io.cpp @@ -5,10 +5,8 @@ #include using namespace torch::indexing; -using namespace torchaudio::sox_utils; -namespace torchaudio { -namespace sox_io { +namespace torchaudio::sox { c10::optional get_info_file( const std::string& path, @@ -68,8 +66,7 @@ c10::optional> load_audio_file( c10::optional channels_first, const c10::optional& format) { auto effects = get_effects(frame_offset, num_frames); - return torchaudio::sox_effects::apply_effects_file( - path, effects, normalize, channels_first, format); + return apply_effects_file(path, effects, normalize, channels_first, format); } void save_audio_file( @@ -123,7 +120,7 @@ void save_audio_file( "Error saving audio file: failed to open file ", path); - torchaudio::sox_effects_chain::SoxEffectsChain chain( + SoxEffectsChain chain( /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), /*output_encoding=*/sf->encoding); chain.addInputTensor(&tensor, sample_rate, channels_first); @@ -132,14 +129,9 @@ void save_audio_file( } TORCH_LIBRARY_FRAGMENT(torchaudio, m) { - m.def("torchaudio::sox_io_get_info", &torchaudio::sox_io::get_info_file); - m.def( - "torchaudio::sox_io_load_audio_file", - &torchaudio::sox_io::load_audio_file); - m.def( - "torchaudio::sox_io_save_audio_file", - &torchaudio::sox_io::save_audio_file); + m.def("torchaudio::sox_io_get_info", &get_info_file); + m.def("torchaudio::sox_io_load_audio_file", &load_audio_file); + m.def("torchaudio::sox_io_save_audio_file", &save_audio_file); } -} // namespace sox_io -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/io.h b/torchaudio/csrc/sox/io.h index a1f4c8a5bc..7ef84e48ad 100644 --- a/torchaudio/csrc/sox/io.h +++ b/torchaudio/csrc/sox/io.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_io { +namespace torchaudio::sox { auto get_effects( const c10::optional& frame_offset, @@ -37,7 +36,6 @@ void save_audio_file( c10::optional encoding, c10::optional bits_per_sample); -} // namespace sox_io -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/pybind/effects.cpp b/torchaudio/csrc/sox/pybind/effects.cpp index db80f98d63..9b9e04bb05 100644 --- a/torchaudio/csrc/sox/pybind/effects.cpp +++ b/torchaudio/csrc/sox/pybind/effects.cpp @@ -2,10 +2,7 @@ #include #include -using namespace torchaudio::sox_utils; - -namespace torchaudio { -namespace sox_effects { +namespace torchaudio::sox { // Streaming decoding over file-like object is tricky because libsox operates on // FILE pointer. 
The folloing is what `sox` and `play` commands do @@ -95,7 +92,7 @@ auto apply_effects_fileobj( // Create and run SoxEffectsChain const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision); - torchaudio::sox_effects_chain::SoxEffectsChainPyBind chain( + SoxEffectsChainPyBind chain( /*input_encoding=*/sf->encoding, /*output_encoding=*/get_tensor_encodinginfo(dtype)); chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj); @@ -119,5 +116,4 @@ auto apply_effects_fileobj( tensor, static_cast(chain.getOutputSampleRate())); } -} // namespace sox_effects -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/pybind/effects.h b/torchaudio/csrc/sox/pybind/effects.h index 7f1e653cd5..1cdcef3330 100644 --- a/torchaudio/csrc/sox/pybind/effects.h +++ b/torchaudio/csrc/sox/pybind/effects.h @@ -3,8 +3,7 @@ #include -namespace torchaudio { -namespace sox_effects { +namespace torchaudio::sox { auto apply_effects_fileobj( py::object fileobj, @@ -14,7 +13,6 @@ auto apply_effects_fileobj( c10::optional format) -> c10::optional>; -} // namespace sox_effects -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/pybind/effects_chain.cpp b/torchaudio/csrc/sox/pybind/effects_chain.cpp index 42128433d6..237358a0e0 100644 --- a/torchaudio/csrc/sox/pybind/effects_chain.cpp +++ b/torchaudio/csrc/sox/pybind/effects_chain.cpp @@ -2,11 +2,7 @@ #include #include -using namespace torchaudio::sox_utils; - -namespace torchaudio { -namespace sox_effects_chain { - +namespace torchaudio::sox { namespace { /// helper classes for passing file-like object to SoxEffectChain @@ -233,5 +229,4 @@ void SoxEffectsChainPyBind::addOutputFileObj( } } -} // namespace sox_effects_chain -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/pybind/effects_chain.h b/torchaudio/csrc/sox/pybind/effects_chain.h index acbacf6013..e5ae5bfd84 100644 --- a/torchaudio/csrc/sox/pybind/effects_chain.h +++ b/torchaudio/csrc/sox/pybind/effects_chain.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_effects_chain { +namespace torchaudio::sox { class SoxEffectsChainPyBind : public SoxEffectsChain { using SoxEffectsChain::SoxEffectsChain; @@ -24,7 +23,6 @@ class SoxEffectsChainPyBind : public SoxEffectsChain { py::object* fileobj); }; -} // namespace sox_effects_chain -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/pybind/io.cpp b/torchaudio/csrc/sox/pybind/io.cpp index 5fc6d271b5..0ccf8416a6 100644 --- a/torchaudio/csrc/sox/pybind/io.cpp +++ b/torchaudio/csrc/sox/pybind/io.cpp @@ -7,10 +7,7 @@ #include -using namespace torchaudio::sox_utils; - -namespace torchaudio { -namespace sox_io { +namespace torchaudio::sox { auto get_info_fileobj(py::object fileobj, c10::optional format) -> c10::optional { @@ -83,7 +80,7 @@ auto load_audio_fileobj( c10::optional format) -> c10::optional> { auto effects = get_effects(frame_offset, num_frames); - return torchaudio::sox_effects::apply_effects_fileobj( + return apply_effects_fileobj( std::move(fileobj), effects, normalize, @@ -177,7 +174,7 @@ void save_audio_fileobj( "Error saving audio file: failed to open memory stream."); } - torchaudio::sox_effects_chain::SoxEffectsChainPyBind chain( + SoxEffectsChainPyBind chain( /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()), /*output_encoding=*/sf->encoding); chain.addInputTensor(&tensor, sample_rate, channels_first); @@ -191,5 +188,4 @@ void 
save_audio_fileobj( fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size)); } -} // namespace sox_io -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/pybind/io.h b/torchaudio/csrc/sox/pybind/io.h index db91ad4ace..02d874c350 100644 --- a/torchaudio/csrc/sox/pybind/io.h +++ b/torchaudio/csrc/sox/pybind/io.h @@ -3,8 +3,7 @@ #include -namespace torchaudio { -namespace sox_io { +namespace torchaudio::sox { using MetaDataTuple = std::tuple; @@ -31,7 +30,6 @@ void save_audio_fileobj( c10::optional encoding, c10::optional bits_per_sample); -} // namespace sox_io -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/pybind/pybind.cpp b/torchaudio/csrc/sox/pybind/pybind.cpp index 751471c52e..f324bf3159 100644 --- a/torchaudio/csrc/sox/pybind/pybind.cpp +++ b/torchaudio/csrc/sox/pybind/pybind.cpp @@ -1,23 +1,27 @@ #include +#include -#include -#include +namespace torchaudio { +namespace sox { +namespace { PYBIND11_MODULE(_torchaudio_sox, m) { + m.def("set_seed", &set_seed, "Set random seed."); + m.def("set_verbosity", &set_verbosity, "Set verbosity."); + m.def("set_use_threads", &set_use_threads, "Set threading."); + m.def("set_buffer_size", &set_buffer_size, "Set buffer size."); + m.def("get_buffer_size", &get_buffer_size, "Get buffer size."); + m.def("list_effects", &list_effects, "List available effects."); m.def( - "get_info_fileobj", - &torchaudio::sox_io::get_info_fileobj, - "Get metadata of audio in file object."); + "list_read_formats", + &list_read_formats, + "List supported formats for decoding."); m.def( - "load_audio_fileobj", - &torchaudio::sox_io::load_audio_fileobj, - "Load audio from file object."); - m.def( - "save_audio_fileobj", - &torchaudio::sox_io::save_audio_fileobj, - "Save audio to file obj."); - m.def( - "apply_effects_fileobj", - &torchaudio::sox_effects::apply_effects_fileobj, - "Decode audio data from file-like obj and apply effects."); + "list_write_formats", + &list_write_formats, + "List supported formats for encoding."); } + +} // namespace +} // namespace sox +} // namespace torchaudio diff --git a/torchaudio/csrc/sox/pybind/utils.cpp b/torchaudio/csrc/sox/pybind/utils.cpp index 1744be281a..5c805a8124 100644 --- a/torchaudio/csrc/sox/pybind/utils.cpp +++ b/torchaudio/csrc/sox/pybind/utils.cpp @@ -1,7 +1,6 @@ #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { auto read_fileobj(py::object* fileobj, const uint64_t size, char* buffer) -> uint64_t { @@ -29,5 +28,4 @@ auto read_fileobj(py::object* fileobj, const uint64_t size, char* buffer) return num_read; } -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/pybind/utils.h b/torchaudio/csrc/sox/pybind/utils.h index 21955e255c..09df3c3416 100644 --- a/torchaudio/csrc/sox/pybind/utils.h +++ b/torchaudio/csrc/sox/pybind/utils.h @@ -3,12 +3,10 @@ #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { auto read_fileobj(py::object* fileobj, uint64_t size, char* buffer) -> uint64_t; -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/types.cpp b/torchaudio/csrc/sox/types.cpp index 9beaadda40..35ec0d45d7 100644 --- a/torchaudio/csrc/sox/types.cpp +++ b/torchaudio/csrc/sox/types.cpp @@ -1,7 +1,6 @@ #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { Format get_format_from_string(const std::string& format) { 
if (format == "wav") @@ -58,7 +57,7 @@ std::string to_string(Encoding v) { } } -Encoding get_encoding_from_option(const c10::optional encoding) { +Encoding get_encoding_from_option(const c10::optional& encoding) { if (!encoding.has_value()) return Encoding::NOT_PROVIDED; std::string v = encoding.value(); @@ -75,7 +74,7 @@ Encoding get_encoding_from_option(const c10::optional encoding) { TORCH_CHECK(false, "Internal Error: unexpected encoding value: ", v); } -BitDepth get_bit_depth_from_option(const c10::optional bit_depth) { +BitDepth get_bit_depth_from_option(const c10::optional& bit_depth) { if (!bit_depth.has_value()) return BitDepth::NOT_PROVIDED; int64_t v = bit_depth.value(); @@ -129,5 +128,4 @@ std::string get_encoding(sox_encoding_t encoding) { } } -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/types.h b/torchaudio/csrc/sox/types.h index afd84791a6..5a15556ea9 100644 --- a/torchaudio/csrc/sox/types.h +++ b/torchaudio/csrc/sox/types.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { enum class Format { WAV, @@ -39,7 +38,7 @@ enum class Encoding { }; std::string to_string(Encoding v); -Encoding get_encoding_from_option(const c10::optional encoding); +Encoding get_encoding_from_option(const c10::optional& encoding); enum class BitDepth : unsigned { NOT_PROVIDED = 0, @@ -50,11 +49,10 @@ enum class BitDepth : unsigned { B64 = 64, }; -BitDepth get_bit_depth_from_option(const c10::optional bit_depth); +BitDepth get_bit_depth_from_option(const c10::optional& bit_depth); std::string get_encoding(sox_encoding_t encoding); -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp index 1308d21d75..5b662bd6ff 100644 --- a/torchaudio/csrc/sox/utils.cpp +++ b/torchaudio/csrc/sox/utils.cpp @@ -3,8 +3,7 @@ #include #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { void set_seed(const int64_t seed) { sox_get_globals()->ranqd1 = static_cast(seed); @@ -94,7 +93,7 @@ void validate_input_file(const SoxFormat& sf, const std::string& path) { "Error loading audio file: unknown encoding."); } -void validate_input_tensor(const torch::Tensor tensor) { +void validate_input_tensor(const torch::Tensor& tensor) { TORCH_CHECK(tensor.device().is_cpu(), "Input tensor has to be on CPU."); TORCH_CHECK(tensor.ndimension() == 2, "Input tensor has to be 2D."); @@ -185,7 +184,7 @@ torch::Tensor convert_to_tensor( return t.contiguous(); } -const std::string get_filetype(const std::string path) { +const std::string get_filetype(const std::string& path) { std::string ext = path.substr(path.find_last_of(".") + 1); std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); return ext; @@ -279,9 +278,9 @@ std::tuple get_save_encoding_for_wav( std::tuple get_save_encoding( const std::string& format, - const caffe2::TypeMeta dtype, - const c10::optional encoding, - const c10::optional bits_per_sample) { + const caffe2::TypeMeta& dtype, + const c10::optional& encoding, + const c10::optional& bits_per_sample) { const Format fmt = get_format_from_string(format); const Encoding enc = get_encoding_from_option(encoding); const BitDepth bps = get_bit_depth_from_option(bits_per_sample); @@ -386,7 +385,7 @@ std::tuple get_save_encoding( } } -unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { +unsigned get_precision(const std::string& 
filetype, caffe2::TypeMeta dtype) { if (filetype == "mp3") return SOX_UNSPEC; if (filetype == "flac") @@ -426,7 +425,7 @@ unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { sox_signalinfo_t get_signalinfo( const torch::Tensor* waveform, const int64_t sample_rate, - const std::string filetype, + const std::string& filetype, const bool channels_first) { return sox_signalinfo_t{ /*rate=*/static_cast(sample_rate), @@ -477,10 +476,10 @@ sox_encodinginfo_t get_tensor_encodinginfo(caffe2::TypeMeta dtype) { sox_encodinginfo_t get_encodinginfo_for_save( const std::string& format, - const caffe2::TypeMeta dtype, - const c10::optional compression, - const c10::optional encoding, - const c10::optional bits_per_sample) { + const caffe2::TypeMeta& dtype, + const c10::optional& compression, + const c10::optional& encoding, + const c10::optional& bits_per_sample) { auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample); return sox_encodinginfo_t{ /*encoding=*/std::get<0>(enc), @@ -492,30 +491,4 @@ sox_encodinginfo_t get_encodinginfo_for_save( /*opposite_endian=*/sox_false}; } -TORCH_LIBRARY_FRAGMENT(torchaudio, m) { - m.def("torchaudio::sox_utils_set_seed", &torchaudio::sox_utils::set_seed); - m.def( - "torchaudio::sox_utils_set_verbosity", - &torchaudio::sox_utils::set_verbosity); - m.def( - "torchaudio::sox_utils_set_use_threads", - &torchaudio::sox_utils::set_use_threads); - m.def( - "torchaudio::sox_utils_set_buffer_size", - &torchaudio::sox_utils::set_buffer_size); - m.def( - "torchaudio::sox_utils_list_effects", - &torchaudio::sox_utils::list_effects); - m.def( - "torchaudio::sox_utils_list_read_formats", - &torchaudio::sox_utils::list_read_formats); - m.def( - "torchaudio::sox_utils_list_write_formats", - &torchaudio::sox_utils::list_write_formats); - m.def( - "torchaudio::sox_utils_get_buffer_size", - &torchaudio::sox_utils::get_buffer_size); -} - -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox diff --git a/torchaudio/csrc/sox/utils.h b/torchaudio/csrc/sox/utils.h index ca84b60043..255d7270fe 100644 --- a/torchaudio/csrc/sox/utils.h +++ b/torchaudio/csrc/sox/utils.h @@ -4,8 +4,7 @@ #include #include -namespace torchaudio { -namespace sox_utils { +namespace torchaudio::sox { //////////////////////////////////////////////////////////////////////////////// // APIs for Python interaction @@ -54,7 +53,7 @@ struct SoxFormat { /// /// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32 -void validate_input_tensor(const torch::Tensor); +void validate_input_tensor(const torch::Tensor&); /// /// Get target dtype for the given encoding and precision. @@ -86,13 +85,13 @@ torch::Tensor convert_to_tensor( const bool channels_first); /// Extract extension from file path -const std::string get_filetype(const std::string path); +const std::string get_filetype(const std::string& path); /// Get sox_signalinfo_t for passing a torch::Tensor object. 
sox_signalinfo_t get_signalinfo( const torch::Tensor* waveform, const int64_t sample_rate, - const std::string filetype, + const std::string& filetype, const bool channels_first); /// Get sox_encodinginfo_t for Tensor I/O @@ -101,11 +100,10 @@ sox_encodinginfo_t get_tensor_encodinginfo(const caffe2::TypeMeta dtype); /// Get sox_encodinginfo_t for saving to file/file object sox_encodinginfo_t get_encodinginfo_for_save( const std::string& format, - const caffe2::TypeMeta dtype, - const c10::optional compression, - const c10::optional encoding, - const c10::optional bits_per_sample); + const caffe2::TypeMeta& dtype, + const c10::optional& compression, + const c10::optional& encoding, + const c10::optional& bits_per_sample); -} // namespace sox_utils -} // namespace torchaudio +} // namespace torchaudio::sox #endif diff --git a/torchaudio/csrc/utils.cpp b/torchaudio/csrc/utils.cpp index c76a4ffa7a..8c5898cb49 100644 --- a/torchaudio/csrc/utils.cpp +++ b/torchaudio/csrc/utils.cpp @@ -7,14 +7,6 @@ namespace torchaudio { -bool is_kaldi_available() { -#ifdef INCLUDE_KALDI - return true; -#else - return false; -#endif -} - bool is_rir_available() { #ifdef INCLUDE_RIR return true; diff --git a/torchaudio/csrc/utils.h b/torchaudio/csrc/utils.h index 751cfa1ad2..1b2be53ee8 100644 --- a/torchaudio/csrc/utils.h +++ b/torchaudio/csrc/utils.h @@ -1,8 +1,7 @@ #pragma once -#include +#include namespace torchaudio { -bool is_kaldi_available(); bool is_rir_available(); bool is_align_available(); c10::optional cuda_version(); diff --git a/torchaudio/datasets/cmudict.py b/torchaudio/datasets/cmudict.py index ea8d00a53d..3ec2ae7fea 100644 --- a/torchaudio/datasets/cmudict.py +++ b/torchaudio/datasets/cmudict.py @@ -10,71 +10,69 @@ "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4", # noqa: E501 "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027", # noqa: E501 } -_PUNCTUATIONS = set( - [ - "!EXCLAMATION-POINT", - '"CLOSE-QUOTE', - '"DOUBLE-QUOTE', - '"END-OF-QUOTE', - '"END-QUOTE', - '"IN-QUOTES', - '"QUOTE', - '"UNQUOTE', - "#HASH-MARK", - "#POUND-SIGN", - "#SHARP-SIGN", - "%PERCENT", - "&ERSAND", - "'END-INNER-QUOTE", - "'END-QUOTE", - "'INNER-QUOTE", - "'QUOTE", - "'SINGLE-QUOTE", - "(BEGIN-PARENS", - "(IN-PARENTHESES", - "(LEFT-PAREN", - "(OPEN-PARENTHESES", - "(PAREN", - "(PARENS", - "(PARENTHESES", - ")CLOSE-PAREN", - ")CLOSE-PARENTHESES", - ")END-PAREN", - ")END-PARENS", - ")END-PARENTHESES", - ")END-THE-PAREN", - ")PAREN", - ")PARENS", - ")RIGHT-PAREN", - ")UN-PARENTHESES", - "+PLUS", - ",COMMA", - "--DASH", - "-DASH", - "-HYPHEN", - "...ELLIPSIS", - ".DECIMAL", - ".DOT", - ".FULL-STOP", - ".PERIOD", - ".POINT", - "/SLASH", - ":COLON", - ";SEMI-COLON", - ";SEMI-COLON(1)", - "?QUESTION-MARK", - "{BRACE", - "{LEFT-BRACE", - "{OPEN-BRACE", - "}CLOSE-BRACE", - "}RIGHT-BRACE", - ] -) +_PUNCTUATIONS = { + "!EXCLAMATION-POINT", + '"CLOSE-QUOTE', + '"DOUBLE-QUOTE', + '"END-OF-QUOTE', + '"END-QUOTE', + '"IN-QUOTES', + '"QUOTE', + '"UNQUOTE', + "#HASH-MARK", + "#POUND-SIGN", + "#SHARP-SIGN", + "%PERCENT", + "&ERSAND", + "'END-INNER-QUOTE", + "'END-QUOTE", + "'INNER-QUOTE", + "'QUOTE", + "'SINGLE-QUOTE", + "(BEGIN-PARENS", + "(IN-PARENTHESES", + "(LEFT-PAREN", + "(OPEN-PARENTHESES", + "(PAREN", + "(PARENS", + "(PARENTHESES", + ")CLOSE-PAREN", + ")CLOSE-PARENTHESES", + ")END-PAREN", + ")END-PARENS", + ")END-PARENTHESES", + ")END-THE-PAREN", 
+ ")PAREN", + ")PARENS", + ")RIGHT-PAREN", + ")UN-PARENTHESES", + "+PLUS", + ",COMMA", + "--DASH", + "-DASH", + "-HYPHEN", + "...ELLIPSIS", + ".DECIMAL", + ".DOT", + ".FULL-STOP", + ".PERIOD", + ".POINT", + "/SLASH", + ":COLON", + ";SEMI-COLON", + ";SEMI-COLON(1)", + "?QUESTION-MARK", + "{BRACE", + "{LEFT-BRACE", + "{OPEN-BRACE", + "}CLOSE-BRACE", + "}RIGHT-BRACE", +} def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[str]: _alt_re = re.compile(r"\([0-9]+\)") - cmudict: List[Tuple[str, List[str]]] = list() + cmudict: List[Tuple[str, List[str]]] = [] for line in lines: if not line or line.startswith(";;;"): # ignore comments continue diff --git a/torchaudio/functional/__init__.py b/torchaudio/functional/__init__.py index 6c874d28a6..5f06a8a837 100644 --- a/torchaudio/functional/__init__.py +++ b/torchaudio/functional/__init__.py @@ -28,7 +28,6 @@ apply_beamforming, apply_codec, compute_deltas, - compute_kaldi_pitch, convolve, create_dct, DB_to_amplitude, @@ -65,7 +64,6 @@ __all__ = [ "amplitude_to_DB", "compute_deltas", - "compute_kaldi_pitch", "create_dct", "melscale_fbanks", "linear_fbanks", diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 6c8e7f5b9c..dc65a8cfa8 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -1390,11 +1390,11 @@ def _measure( cepstrum_end: int, noise_reduction_amount: float, measure_smooth_time_mult: float, - noise_up_time_mult: float, - noise_down_time_mult: float, - index_ns: int, + noise_up_time_mult: Tensor, + noise_down_time_mult: Tensor, boot_count: int, ) -> float: + device = samples.device if spectrum.size(-1) != noise_spectrum.size(-1): raise ValueError( @@ -1402,37 +1402,29 @@ def _measure( f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}" ) - samplesLen_ns = samples.size()[-1] dft_len_ws = spectrum.size()[-1] - dftBuf = torch.zeros(dft_len_ws) + dftBuf = torch.zeros(dft_len_ws, device=device) - _index_ns = torch.tensor([index_ns] + [(index_ns + i) % samplesLen_ns for i in range(1, measure_len_ws)]) - dftBuf[:measure_len_ws] = samples[_index_ns] * spectrum_window[:measure_len_ws] - - # memset(c->dftBuf + i, 0, (p->dft_len_ws - i) * sizeof(*c->dftBuf)); - dftBuf[measure_len_ws:dft_len_ws].zero_() + dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws] # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf); _dftBuf = torch.fft.rfft(dftBuf) - # memset(c->dftBuf, 0, p->spectrum_start * sizeof(*c->dftBuf)); - _dftBuf[:spectrum_start].zero_() - mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult _d = _dftBuf[spectrum_start:spectrum_end].abs() spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult)) _d = spectrum[spectrum_start:spectrum_end] ** 2 - _zeros = torch.zeros(spectrum_end - spectrum_start) + _zeros = torch.zeros(spectrum_end - spectrum_start, device=device) _mult = ( _zeros if boot_count >= 0 else torch.where( _d > noise_spectrum[spectrum_start:spectrum_end], - torch.tensor(noise_up_time_mult), # if - torch.tensor(noise_down_time_mult), # else + noise_up_time_mult, # if + noise_down_time_mult, # else, ) ) @@ -1441,10 +1433,10 @@ def _measure( torch.max( _zeros, _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end], - ) + ), ) - _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1) + _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device) _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window 
_cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_() @@ -1539,6 +1531,7 @@ def vad( Reference: - http://sox.sourceforge.net/sox.html """ + device = waveform.device if waveform.ndim > 2: warnings.warn( @@ -1566,23 +1559,23 @@ def vad( fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5) samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns - spectrum_window = torch.zeros(measure_len_ws) + spectrum_window = torch.zeros(measure_len_ws, device=device) for i in range(measure_len_ws): # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32) spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws)) # lsx_apply_hann(spectrum_window, (int)measure_len_ws); - spectrum_window *= torch.hann_window(measure_len_ws, dtype=torch.float) + spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float) spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5) spectrum_start: int = max(spectrum_start, 1) spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5) spectrum_end: int = min(spectrum_end, dft_len_ws // 2) - cepstrum_window = torch.zeros(spectrum_end - spectrum_start) + cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device) for i in range(spectrum_end - spectrum_start): cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start) # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start)); - cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, dtype=torch.float) + cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float) cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq) cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq) @@ -1594,14 +1587,13 @@ def vad( f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}." 
) - noise_up_time_mult = math.exp(-1.0 / (noise_up_time * measure_freq)) - noise_down_time_mult = math.exp(-1.0 / (noise_down_time * measure_freq)) + noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device) + noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device) measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq)) trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq)) boot_count_max = int(boot_time * measure_freq - 0.5) - measure_timer_ns = measure_len_ns - boot_count = measures_index = flushedLen_ns = samplesIndex_ns = 0 + boot_count = measures_index = flushedLen_ns = 0 # pack batch shape = waveform.size() @@ -1609,80 +1601,65 @@ def vad( n_channels, ilen = waveform.size() - mean_meas = torch.zeros(n_channels) - samples = torch.zeros(n_channels, samplesLen_ns) - spectrum = torch.zeros(n_channels, dft_len_ws) - noise_spectrum = torch.zeros(n_channels, dft_len_ws) - measures = torch.zeros(n_channels, measures_len) + mean_meas = torch.zeros(n_channels, device=device) + spectrum = torch.zeros(n_channels, dft_len_ws, device=device) + noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device) + measures = torch.zeros(n_channels, measures_len, device=device) has_triggered: bool = False num_measures_to_flush: int = 0 - pos: int = 0 - while pos < ilen and not has_triggered: - measure_timer_ns -= 1 + pos = 0 + for pos in range(measure_len_ns, ilen, measure_period_ns): for i in range(n_channels): - samples[i, samplesIndex_ns] = waveform[i, pos] - # if (!p->measure_timer_ns) { - if measure_timer_ns == 0: - index_ns: int = (samplesIndex_ns + samplesLen_ns - measure_len_ns) % samplesLen_ns - meas: float = _measure( - measure_len_ws=measure_len_ws, - samples=samples[i], - spectrum=spectrum[i], - noise_spectrum=noise_spectrum[i], - spectrum_window=spectrum_window, - spectrum_start=spectrum_start, - spectrum_end=spectrum_end, - cepstrum_window=cepstrum_window, - cepstrum_start=cepstrum_start, - cepstrum_end=cepstrum_end, - noise_reduction_amount=noise_reduction_amount, - measure_smooth_time_mult=measure_smooth_time_mult, - noise_up_time_mult=noise_up_time_mult, - noise_down_time_mult=noise_down_time_mult, - index_ns=index_ns, - boot_count=boot_count, - ) - measures[i, measures_index] = meas - mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult) - - has_triggered = has_triggered or (mean_meas[i] >= trigger_level) - if has_triggered: - n: int = measures_len - k: int = measures_index - jTrigger: int = n - jZero: int = n - j: int = 0 - - for j in range(n): - if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len): - jZero = jTrigger = j - elif (measures[i, k] == 0) and (jTrigger >= jZero): - jZero = j - k = (k + n - 1) % n - j = min(j, jZero) - # num_measures_to_flush = range_limit(j, num_measures_to_flush, n); - num_measures_to_flush = min(max(num_measures_to_flush, j), n) - # end if has_triggered - # end if (measure_timer_ns == 0): - # end for - samplesIndex_ns += 1 - pos += 1 - # end while - if samplesIndex_ns == samplesLen_ns: - samplesIndex_ns = 0 - if measure_timer_ns == 0: - measure_timer_ns = measure_period_ns - measures_index += 1 - measures_index = measures_index % measures_len - if boot_count >= 0: - boot_count = -1 if boot_count == boot_count_max else boot_count + 1 + meas: float = _measure( + measure_len_ws=measure_len_ws, + samples=waveform[i, pos - measure_len_ws : pos], + spectrum=spectrum[i], + 
noise_spectrum=noise_spectrum[i], + spectrum_window=spectrum_window, + spectrum_start=spectrum_start, + spectrum_end=spectrum_end, + cepstrum_window=cepstrum_window, + cepstrum_start=cepstrum_start, + cepstrum_end=cepstrum_end, + noise_reduction_amount=noise_reduction_amount, + measure_smooth_time_mult=measure_smooth_time_mult, + noise_up_time_mult=noise_up_time_mult, + noise_down_time_mult=noise_down_time_mult, + boot_count=boot_count, + ) + measures[i, measures_index] = meas + mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult) + + has_triggered = has_triggered or (mean_meas[i] >= trigger_level) + if has_triggered: + n: int = measures_len + k: int = measures_index + jTrigger: int = n + jZero: int = n + j: int = 0 + + for j in range(n): + if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len): + jZero = jTrigger = j + elif (measures[i, k] == 0) and (jTrigger >= jZero): + jZero = j + k = (k + n - 1) % n + j = min(j, jZero) + # num_measures_to_flush = range_limit(j, num_measures_to_flush, n); + num_measures_to_flush = min(max(num_measures_to_flush, j), n) + # end if has_triggered + # end for channel + measures_index += 1 + measures_index = measures_index % measures_len + if boot_count >= 0: + boot_count = -1 if boot_count == boot_count_max else boot_count + 1 if has_triggered: flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns - samplesIndex_ns = (samplesIndex_ns + flushedLen_ns) % samplesLen_ns - + break + # end for window res = waveform[:, pos - samplesLen_ns + flushedLen_ns :] # unpack batch return res.view(shape[:-1] + res.shape[-1:]) diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py index 2206690d29..8b732cf663 100644 --- a/torchaudio/functional/functional.py +++ b/torchaudio/functional/functional.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -import io import math +import tempfile import warnings from collections.abc import Sequence from typing import List, Optional, Tuple, Union @@ -10,6 +10,7 @@ import torchaudio from torch import Tensor from torchaudio._extension import fail_if_no_align +from torchaudio._internal.module_utils import deprecated from .filtering import highpass_biquad, treble_biquad @@ -20,7 +21,6 @@ "amplitude_to_DB", "DB_to_amplitude", "compute_deltas", - "compute_kaldi_pitch", "melscale_fbanks", "linear_fbanks", "create_dct", @@ -1290,6 +1290,7 @@ def spectral_centroid( @torchaudio._extension.fail_if_no_sox +@deprecated("Please migrate to torchaudio.io.AudioEffector.", remove=False) def apply_codec( waveform: Tensor, sample_rate: int, @@ -1304,6 +1305,12 @@ def apply_codec( .. devices:: CPU + .. warning:: + + This function has been deprecated. + Please migrate to :py:class:`torchaudio.io.AudioEffector`, which works on all platforms, + and supports streaming processing. + Args: waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. sample_rate (int): Sample rate of the audio waveform. @@ -1322,129 +1329,17 @@ def apply_codec( Tensor: Resulting Tensor. If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`. 
""" - bytes = io.BytesIO() - torchaudio.backend.sox_io_backend.save( - bytes, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample - ) - bytes.seek(0) - augmented, sr = torchaudio.backend.sox_io_backend.load(bytes, channels_first=channels_first, format=format) + with tempfile.NamedTemporaryFile() as f: + torchaudio.backend.sox_io_backend.save( + f.name, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample + ) + augmented, sr = torchaudio.backend.sox_io_backend.load(f.name, channels_first=channels_first, format=format) if sr != sample_rate: augmented = resample(augmented, sr, sample_rate) return augmented -@torchaudio._extension.fail_if_no_kaldi -def compute_kaldi_pitch( - waveform: torch.Tensor, - sample_rate: float, - frame_length: float = 25.0, - frame_shift: float = 10.0, - min_f0: float = 50, - max_f0: float = 400, - soft_min_f0: float = 10.0, - penalty_factor: float = 0.1, - lowpass_cutoff: float = 1000, - resample_frequency: float = 4000, - delta_pitch: float = 0.005, - nccf_ballast: float = 7000, - lowpass_filter_width: int = 1, - upsample_filter_width: int = 5, - max_frames_latency: int = 0, - frames_per_chunk: int = 0, - simulate_first_pass_online: bool = False, - recompute_frame: int = 500, - snip_edges: bool = True, -) -> torch.Tensor: - """Extract pitch based on method described in *A pitch extraction algorithm tuned - for automatic speech recognition* :cite:`6854049`. - - .. devices:: CPU - - .. properties:: TorchScript - - This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi. - - Args: - waveform (Tensor): - The input waveform of shape `(..., time)`. - sample_rate (float): - Sample rate of `waveform`. - frame_length (float, optional): - Frame length in milliseconds. (default: 25.0) - frame_shift (float, optional): - Frame shift in milliseconds. (default: 10.0) - min_f0 (float, optional): - Minimum F0 to search for (Hz) (default: 50.0) - max_f0 (float, optional): - Maximum F0 to search for (Hz) (default: 400.0) - soft_min_f0 (float, optional): - Minimum f0, applied in soft way, must not exceed min-f0 (default: 10.0) - penalty_factor (float, optional): - Cost factor for FO change. (default: 0.1) - lowpass_cutoff (float, optional): - Cutoff frequency for LowPass filter (Hz) (default: 1000) - resample_frequency (float, optional): - Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff. - (default: 4000) - delta_pitch( float, optional): - Smallest relative change in pitch that our algorithm measures. (default: 0.005) - nccf_ballast (float, optional): - Increasing this factor reduces NCCF for quiet frames (default: 7000) - lowpass_filter_width (int, optional): - Integer that determines filter width of lowpass filter, more gives sharper filter. - (default: 1) - upsample_filter_width (int, optional): - Integer that determines filter width when upsampling NCCF. (default: 5) - max_frames_latency (int, optional): - Maximum number of frames of latency that we allow pitch tracking to introduce into - the feature processing (affects output only if ``frames_per_chunk > 0`` and - ``simulate_first_pass_online=True``) (default: 0) - frames_per_chunk (int, optional): - The number of frames used for energy normalization. (default: 0) - simulate_first_pass_online (bool, optional): - If true, the function will output features that correspond to what an online decoder - would see in the first pass of decoding -- not the final version of the features, - which is the default. 
(default: False) - Relevant if ``frames_per_chunk > 0``. - recompute_frame (int, optional): - Only relevant for compatibility with online pitch extraction. - A non-critical parameter; the frame at which we recompute some of the forward pointers, - after revising our estimate of the signal energy. - Relevant if ``frames_per_chunk > 0``. (default: 500) - snip_edges (bool, optional): - If this is set to false, the incomplete frames near the ending edge won't be snipped, - so that the number of frames is the file size divided by the frame-shift. - This makes different types of features give the same number of frames. (default: True) - - Returns: - Tensor: Pitch feature. Shape: `(batch, frames 2)` where the last dimension - corresponds to pitch and NCCF. - """ - shape = waveform.shape - waveform = waveform.reshape(-1, shape[-1]) - result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch( - waveform, - sample_rate, - frame_length, - frame_shift, - min_f0, - max_f0, - soft_min_f0, - penalty_factor, - lowpass_cutoff, - resample_frequency, - delta_pitch, - nccf_ballast, - lowpass_filter_width, - upsample_filter_width, - max_frames_latency, - frames_per_chunk, - simulate_first_pass_online, - recompute_frame, - snip_edges, - ) - result = result.reshape(shape[:-1] + result.shape[-2:]) - return result +_CPU = torch.device("cpu") def _get_sinc_resample_kernel( @@ -1455,7 +1350,7 @@ def _get_sinc_resample_kernel( rolloff: float = 0.99, resampling_method: str = "sinc_interp_hann", beta: Optional[float] = None, - device: torch.device = torch.device("cpu"), + device: torch.device = _CPU, dtype: Optional[torch.dtype] = None, ): if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): @@ -2616,12 +2511,12 @@ def forced_align( Args: log_probs (torch.Tensor): log probability of CTC emission output. - Tensor of shape `(T, C)`. where `T` is the input length, + Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length, `C` is the number of characters in alphabet including blank. - targets (torch.Tensor): Target sequence. Tensor of shape `(L,)`, + targets (torch.Tensor): Target sequence. Tensor of shape `(B, L)`, where `L` is the target length. - input_lengths (torch.Tensor): Lengths of the inputs (max value must each be <= `T`). 0-D Tensor (scalar). - target_lengths (torch.Tensor): Lengths of the targets. 0-D Tensor (scalar). + input_lengths (torch.Tensor): Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`. + target_lengths (torch.Tensor): Lengths of the targets. 1-D Tensor of shape `(B,)`. blank_id (int, optional): The index of blank symbol in CTC emission. (Default: 0) Returns: @@ -2639,6 +2534,9 @@ def forced_align( where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens. For example, in str `"aabbc"`, the number of repeats are `2`. + + Note: + The current version only supports ``batch_size``==1. """ if blank in targets: raise ValueError(f"targets Tensor shouldn't contain blank index. 
Found {targets}.") diff --git a/torchaudio/io/_compat.py b/torchaudio/io/_compat.py index 7b122cbcc8..723b7fcaeb 100644 --- a/torchaudio/io/_compat.py +++ b/torchaudio/io/_compat.py @@ -102,7 +102,8 @@ def load_audio_fileobj( format: Optional[str] = None, buffer_size: int = 4096, ) -> Tuple[torch.Tensor, int]: - s = torchaudio.lib._torchaudio_ffmpeg.StreamReaderFileObj(src, format, None, buffer_size) + demuxer = "ogg" if format == "vorbis" else format + s = torchaudio.lib._torchaudio_ffmpeg.StreamReaderFileObj(src, demuxer, None, buffer_size) sample_rate = int(s.get_src_stream_info(s.find_best_audio_stream()).sample_rate) filter = _get_load_filter(frame_offset, num_frames, convert) waveform = _load_audio_fileobj(s, filter, channels_first) @@ -131,7 +132,7 @@ def _native_endianness() -> str: return "be" -def _get_encoder_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int) -> str: +def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str: if bits_per_sample not in {None, 8, 16, 24, 32, 64}: raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.") endianness = _native_endianness() @@ -148,49 +149,93 @@ def _get_encoder_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int if bits_per_sample == 8: raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.") return f"pcm_s{bits_per_sample}{endianness}" - elif encoding == "PCM_U": + if encoding == "PCM_U": if bits_per_sample in (None, 8): return "pcm_u8" raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.") - elif encoding == "PCM_F": + if encoding == "PCM_F": if not bits_per_sample: bits_per_sample = 32 if bits_per_sample in (32, 64): return f"pcm_f{bits_per_sample}{endianness}" raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.") - elif encoding == "ULAW": + if encoding == "ULAW": if bits_per_sample in (None, 8): return "pcm_mulaw" raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.") - elif encoding == "ALAW": + if encoding == "ALAW": if bits_per_sample in (None, 8): return "pcm_alaw" raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.") raise ValueError(f"WAV encoding {encoding} is not supported.") -def _get_encoder(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int) -> str: - if format == "wav": - return _get_encoder_for_wav(dtype, encoding, bits_per_sample) - if format == "flac": - return "flac" - if format in ("ogg", "vorbis"): - if encoding or bits_per_sample: - raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.") - return "vorbis" - return format +def _get_flac_sample_fmt(bps): + if bps is None or bps == 16: + return "s16" + if bps == 24: + return "s32" + raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).") -def _get_encoder_format(format: str, bits_per_sample: Optional[int]) -> str: - if format == "flac": - if not bits_per_sample: - return "s16" - if bits_per_sample == 24: - return "s32" - if bits_per_sample == 16: - return "s16" - raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bits_per_sample} specified).") - return None +def _parse_save_args( + ext: Optional[str], + format: Optional[str], + encoding: Optional[str], + bps: Optional[int], +): + # torchaudio's save function accepts the followings, which do not 1to1 map + # to FFmpeg. + # + # - format: audio format + # - bits_per_sample: encoder sample format + # - encoding: such as PCM_U8. 
+ # + # In FFmpeg, format is specified with the following three (and more) + # + # - muxer: could be audio format or container format. + # the one we passed to the constructor of StreamWriter + # - encoder: the audio encoder used to encode audio + # - encoder sample format: the format used by encoder to encode audio. + # + # If encoder sample format is different from source sample format, StreamWriter + # will insert a filter automatically. + # + def _type(spec): + # either format is exactly the specified one + # or extension matches to the spec AND there is no format override. + return format == spec or (format is None and ext == spec) + + if _type("wav") or _type("amb"): + # wav is special because it supports different encoding through encoders + # each encoder only supports one encoder format + # + # amb format is a special case originated from libsox. + # It is basically a WAV format, with slight modification. + # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795 + # It is a format so that decoders will recognize it as ambisonic. + # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/ + # FFmpeg does not recognize amb because it is basically a WAV format. + muxer = "wav" + encoder = _get_encoder_for_wav(encoding, bps) + sample_fmt = None + elif _type("vorbis"): + # FFpmeg does not recognize vorbis extension, while libsox used to do. + # For the sake of bakward compatibility, (and the simplicity), + # we support the case where users want to do save("foo.vorbis") + muxer = "ogg" + encoder = "vorbis" + sample_fmt = None + else: + muxer = format + encoder = None + sample_fmt = None + if _type("flac"): + sample_fmt = _get_flac_sample_fmt(bps) + if _type("ogg"): + sample_fmt = _get_flac_sample_fmt(bps) + print(ext, format, encoding, bps, "===>", muxer, encoder, sample_fmt) + return muxer, encoder, sample_fmt # NOTE: in contrast to load_audio* and info_audio*, this function is NOT compatible with TorchScript. 
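To make the mapping above concrete, the sketch below shows how a few `torchaudio.save` calls would translate into muxer / encoder / encoder sample format under the new `_parse_save_args` logic. It is illustrative only: the file names are arbitrary, and the exact WAV encoder string depends on the native endianness (a little-endian machine is assumed here, hence "pcm_s16le").

import torch
import torchaudio

waveform = torch.zeros(1, 16000)  # one channel, one second of silence at 16 kHz

# WAV: muxer "wav"; encoder derived from encoding/bits_per_sample (e.g. "pcm_s16le" here)
torchaudio.save("out.wav", waveform, 16000, encoding="PCM_S", bits_per_sample=16)

# FLAC: muxer "flac"; default encoder; encoder sample format "s32" for 24-bit output
torchaudio.save("out.flac", waveform, 16000, bits_per_sample=24)

# Legacy libsox-style ".vorbis" extension: muxer "ogg" with encoder "vorbis"
torchaudio.save("out.vorbis", waveform, 16000)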
@@ -204,25 +248,27 @@ def save_audio( bits_per_sample: Optional[int] = None, buffer_size: int = 4096, ) -> None: + ext = None if hasattr(uri, "write"): if format is None: raise RuntimeError("'format' is required when saving to file object.") else: uri = os.path.normpath(uri) - s = StreamWriter(uri, format=format, buffer_size=buffer_size) - if format is None: - tokens = str(uri).split(".") - if len(tokens) > 1: - format = tokens[-1].lower() + if tokens := str(uri).split(".")[1:]: + ext = tokens[-1].lower() + + muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample) if channels_first: src = src.T + + s = StreamWriter(uri, format=muxer, buffer_size=buffer_size) s.add_audio_stream( sample_rate, num_channels=src.size(-1), format=_get_sample_format(src.dtype), - encoder=_get_encoder(src.dtype, format, encoding, bits_per_sample), - encoder_format=_get_encoder_format(format, bits_per_sample), + encoder=encoder, + encoder_format=enc_fmt, ) with s.open(): s.write_audio_chunk(0, src) diff --git a/torchaudio/io/_effector.py b/torchaudio/io/_effector.py index 38fb0810df..75d4731731 100644 --- a/torchaudio/io/_effector.py +++ b/torchaudio/io/_effector.py @@ -260,13 +260,17 @@ def __init__( self.codec_config = codec_config self.pad_end = pad_end - def _get_reader(self, waveform, sample_rate, frames_per_chunk=None): + def _get_reader(self, waveform, sample_rate, output_sample_rate, frames_per_chunk=None): num_frames, num_channels = waveform.shape if self.format is not None: muxer = self.format encoder = self.encoder option = {} + # Some formats are headerless, so we need to provide this information. + if self.format == "mulaw": + option = {"sample_rate": f"{sample_rate}", "channels": f"{num_channels}"} + else: # PCM muxer = _get_muxer(waveform.dtype) encoder = None @@ -279,7 +283,8 @@ def _get_reader(self, waveform, sample_rate, frames_per_chunk=None): waveform, sample_rate, self.effect, muxer, encoder, self.codec_config, frames_per_chunk ) - filter_desc = _get_afilter_desc(sample_rate, _get_sample_fmt(waveform.dtype), num_channels) + output_sr = sample_rate if output_sample_rate is None else output_sample_rate + filter_desc = _get_afilter_desc(output_sr, _get_sample_fmt(waveform.dtype), num_channels) if self.pad_end: filter_desc = f"{filter_desc},apad=whole_len={num_frames}" @@ -287,12 +292,17 @@ def _get_reader(self, waveform, sample_rate, frames_per_chunk=None): reader.add_audio_stream(frames_per_chunk or -1, -1, filter_desc=filter_desc) return reader - def apply(self, waveform: Tensor, sample_rate: int) -> Tensor: + def apply(self, waveform: Tensor, sample_rate: int, output_sample_rate: Optional[int] = None) -> Tensor: """Apply the effect and/or codecs to the whole tensor. Args: waveform (Tensor): The input waveform. Shape: ``(time, channel)`` - sample_rate (int): Sample rate of the waveform. + sample_rate (int): Sample rate of the input waveform. + output_sample_rate (int or None, optional): Output sample rate. + If provided, override the output sample rate. + Otherwise, the resulting tensor is resampled to have + the same sample rate as the input. + Default: ``None``.
Returns: Tensor: @@ -305,18 +315,25 @@ def apply(self, waveform: Tensor, sample_rate: int) -> Tensor: if waveform.numel() == 0: return waveform - reader = self._get_reader(waveform, sample_rate) + reader = self._get_reader(waveform, sample_rate, output_sample_rate) reader.process_all_packets() (applied,) = reader.pop_chunks() return Tensor(applied) - def stream(self, waveform: Tensor, sample_rate: int, frames_per_chunk: int) -> Iterator[Tensor]: + def stream( + self, waveform: Tensor, sample_rate: int, frames_per_chunk: int, output_sample_rate: Optional[int] = None + ) -> Iterator[Tensor]: """Apply the effect and/or codecs to the given tensor chunk by chunk. Args: waveform (Tensor): The input waveform. Shape: ``(time, channel)`` sample_rate (int): Sample rate of the waveform. frames_per_chunk (int): The number of frames to return at a time. + output_sample_rate (int or None, optional): Output sample rate. + If provided, override the output sample rate. + Otherwise, the resulting tensor is resampled to have + the same sample rate as the input. + Default: ``None``. Returns: Iterator[Tensor]: @@ -330,6 +347,6 @@ def stream(self, waveform: Tensor, sample_rate: int, frames_per_chunk: int) -> I if waveform.numel() == 0: return waveform - reader = self._get_reader(waveform, sample_rate, frames_per_chunk) + reader = self._get_reader(waveform, sample_rate, output_sample_rate, frames_per_chunk) for (applied,) in reader.stream(): yield Tensor(applied) diff --git a/torchaudio/models/rnnt_decoder.py b/torchaudio/models/rnnt_decoder.py index 045e642d0a..5a02b2ca90 100644 --- a/torchaudio/models/rnnt_decoder.py +++ b/torchaudio/models/rnnt_decoder.py @@ -109,13 +109,9 @@ def __init__( self.step_max_tokens = step_max_tokens - def _init_b_hypos(self, hypo: Optional[Hypothesis], device: torch.device) -> List[Hypothesis]: - if hypo is not None: - token = _get_hypo_tokens(hypo)[-1] - state = _get_hypo_state(hypo) - else: - token = self.blank - state = None + def _init_b_hypos(self, device: torch.device) -> List[Hypothesis]: + token = self.blank + state = None one_tensor = torch.tensor([1], device=device) pred_out, _, pred_state = self.model.predict(torch.tensor([[token]], device=device), one_tensor, state) @@ -230,14 +226,14 @@ def _gen_new_hypos( def _search( self, enc_out: torch.Tensor, - hypo: Optional[Hypothesis], + hypo: Optional[List[Hypothesis]], beam_width: int, ) -> List[Hypothesis]: n_time_steps = enc_out.shape[1] device = enc_out.device a_hypos: List[Hypothesis] = [] - b_hypos = self._init_b_hypos(hypo, device) + b_hypos = self._init_b_hypos(device) if hypo is None else hypo for t in range(n_time_steps): a_hypos = b_hypos b_hypos = torch.jit.annotate(List[Hypothesis], []) @@ -263,7 +259,7 @@ def _search( if a_hypos: symbols_current_t += 1 - _, sorted_idx = torch.tensor([self.hypo_sort_key(hypo) for hypo in b_hypos]).topk(beam_width) + _, sorted_idx = torch.tensor([self.hypo_sort_key(hyp) for hyp in b_hypos]).topk(beam_width) b_hypos = [b_hypos[idx] for idx in sorted_idx] return b_hypos @@ -290,8 +286,8 @@ def forward(self, input: torch.Tensor, length: torch.Tensor, beam_width: int) -> if length.shape != () and length.shape != (1,): raise ValueError("length must be of shape () or (1,)") - if input.dim() == 0: - input = input.unsqueeze(0) + if length.dim() == 0: + length = length.unsqueeze(0) enc_out, _ = self.model.transcribe(input, length) return self._search(enc_out, None, beam_width) @@ -303,7 +299,7 @@ def infer( length: torch.Tensor, beam_width: int, state: Optional[List[List[torch.Tensor]]] = 
None, - hypothesis: Optional[Hypothesis] = None, + hypothesis: Optional[List[Hypothesis]] = None, ) -> Tuple[List[Hypothesis], List[List[torch.Tensor]]]: r"""Performs beam search for the given input sequence in streaming mode. @@ -318,7 +314,7 @@ def infer( state (List[List[torch.Tensor]] or None, optional): list of lists of tensors representing transcription network internal state generated in preceding invocation. (Default: ``None``) - hypothesis (Hypothesis or None): hypothesis from preceding invocation to seed + hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed search with. (Default: ``None``) Returns: diff --git a/torchaudio/prototype/functional/__init__.py b/torchaudio/prototype/functional/__init__.py index 36eae2c0c3..3c08461b70 100644 --- a/torchaudio/prototype/functional/__init__.py +++ b/torchaudio/prototype/functional/__init__.py @@ -8,13 +8,14 @@ sinc_impulse_response, ) from ._rir import simulate_rir_ism -from .functional import barkscale_fbanks +from .functional import barkscale_fbanks, chroma_filterbank __all__ = [ "adsr_envelope", "exp_sigmoid", "barkscale_fbanks", + "chroma_filterbank", "extend_pitch", "filter_waveform", "frequency_impulse_response", diff --git a/torchaudio/prototype/functional/functional.py b/torchaudio/prototype/functional/functional.py index e5710620fe..0805a252af 100644 --- a/torchaudio/prototype/functional/functional.py +++ b/torchaudio/prototype/functional/functional.py @@ -1,5 +1,6 @@ import math import warnings +from typing import Optional import torch from torchaudio.functional.functional import _create_triangular_filterbank @@ -66,6 +67,11 @@ def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.T return freqs +def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12): + a440 = 440.0 * 2.0 ** (tuning / bins_per_octave) + return torch.log2(freqs / (a440 / 16)) + + def barkscale_fbanks( n_freqs: int, f_min: float, @@ -121,3 +127,64 @@ def barkscale_fbanks( ) return fb + + +def chroma_filterbank( + sample_rate: int, + n_freqs: int, + n_chroma: int, + *, + tuning: float = 0.0, + ctroct: float = 5.0, + octwidth: Optional[float] = 2.0, + norm: int = 2, + base_c: bool = True, +): + """Create a frequency-to-chroma conversion matrix. Implementation adapted from librosa. + + Args: + sample_rate (int): Sample rate. + n_freqs (int): Number of input frequencies. + n_chroma (int): Number of output chroma. + tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0) + ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0) + octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves. + If ``None``, then disable weighting altogether. (Default: 2.0) + norm (int, optional): order of norm to normalize filter bank by. (Default: 2) + base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. (Default: True) + + Returns: + torch.Tensor: Chroma filter bank, with shape `(n_freqs, n_chroma)`. + """ + # Skip redundant upper half of frequency range. 
+ freqs = torch.linspace(0, sample_rate // 2, n_freqs)[1:] + freq_bins = n_chroma * _hz_to_octs(freqs, bins_per_octave=n_chroma, tuning=tuning) + freq_bins = torch.cat((torch.tensor([freq_bins[0] - 1.5 * n_chroma]), freq_bins)) + freq_bin_widths = torch.cat( + ( + torch.maximum(freq_bins[1:] - freq_bins[:-1], torch.tensor(1.0)), + torch.tensor([1]), + ) + ) + + # (n_freqs, n_chroma) + D = freq_bins.unsqueeze(1) - torch.arange(0, n_chroma) + + n_chroma2 = round(n_chroma / 2) + + # Project to range [-n_chroma/2, n_chroma/2 - 1] + D = torch.remainder(D + n_chroma2, n_chroma) - n_chroma2 + + fb = torch.exp(-0.5 * (2 * D / torch.tile(freq_bin_widths.unsqueeze(1), (1, n_chroma))) ** 2) + fb = torch.nn.functional.normalize(fb, p=norm, dim=1) + + if octwidth is not None: + fb *= torch.tile( + torch.exp(-0.5 * (((freq_bins.unsqueeze(1) / n_chroma - ctroct) / octwidth) ** 2)), + (1, n_chroma), + ) + + if base_c: + fb = torch.roll(fb, -3 * (n_chroma // 12), dims=1) + + return fb diff --git a/torchaudio/prototype/transforms/__init__.py b/torchaudio/prototype/transforms/__init__.py index a992c7f057..457f20e119 100644 --- a/torchaudio/prototype/transforms/__init__.py +++ b/torchaudio/prototype/transforms/__init__.py @@ -1,7 +1,9 @@ -from ._transforms import BarkScale, BarkSpectrogram, InverseBarkScale +from ._transforms import BarkScale, BarkSpectrogram, ChromaScale, ChromaSpectrogram, InverseBarkScale __all__ = [ "BarkScale", "BarkSpectrogram", + "ChromaScale", + "ChromaSpectrogram", "InverseBarkScale", ] diff --git a/torchaudio/prototype/transforms/_transforms.py b/torchaudio/prototype/transforms/_transforms.py index 3d024bf7b5..9d89cc5339 100644 --- a/torchaudio/prototype/transforms/_transforms.py +++ b/torchaudio/prototype/transforms/_transforms.py @@ -1,7 +1,7 @@ from typing import Callable, Optional import torch -from torchaudio.prototype.functional import barkscale_fbanks +from torchaudio.prototype.functional import barkscale_fbanks, chroma_filterbank from torchaudio.transforms import Spectrogram @@ -295,3 +295,162 @@ def forward(self, waveform: torch.Tensor) -> torch.Tensor: specgram = self.spectrogram(waveform) bark_specgram = self.bark_scale(specgram) return bark_specgram + + +class ChromaScale(torch.nn.Module): + r"""Converts spectrogram to chromagram. + + .. devices:: CPU CUDA + + .. properties:: Autograd + + Args: + sample_rate (int): Sample rate of audio signal. + n_freqs (int): Number of frequency bins in STFT. See ``n_fft`` in :class:`Spectrogram`. + n_chroma (int, optional): Number of chroma. (Default: ``12``) + tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0) + ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0) + octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves. + If ``None``, then disable weighting altogether. (Default: 2.0) + norm (int, optional): order of norm to normalize filter bank by. (Default: 2) + base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A. 
(Default: True) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> spectrogram_transform = transforms.Spectrogram(n_fft=1024) + >>> spectrogram = spectrogram_transform(waveform) + >>> chroma_transform = transforms.ChromaScale(sample_rate=sample_rate, n_freqs=1024 // 2 + 1) + >>> chroma_spectrogram = chroma_transform(spectrogram) + + See also: + :py:func:`torchaudio.prototype.functional.chroma_filterbank` — function used to + generate the filter bank. + """ + + def __init__( + self, + sample_rate: int, + n_freqs: int, + *, + n_chroma: int = 12, + tuning: float = 0.0, + ctroct: float = 5.0, + octwidth: Optional[float] = 2.0, + norm: int = 2, + base_c: bool = True, + ): + super().__init__() + fb = chroma_filterbank( + sample_rate, n_freqs, n_chroma, tuning=tuning, ctroct=ctroct, octwidth=octwidth, norm=norm, base_c=base_c + ) + self.register_buffer("fb", fb) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r""" + Args: + specgram (torch.Tensor): Spectrogram of dimension (..., ``n_freqs``, time). + + Returns: + torch.Tensor: Chroma spectrogram of size (..., ``n_chroma``, time). + """ + return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2) + + +class ChromaSpectrogram(torch.nn.Module): + r"""Generates chromagram for audio signal. + + .. devices:: CPU CUDA + + .. properties:: Autograd + + Composes :py:func:`torchaudio.transforms.Spectrogram` and + :py:func:`torchaudio.prototype.transforms.ChromaScale`. + + Args: + sample_rate (int): Sample rate of audio signal. + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. + win_length (int or None, optional): Window size. (Default: ``n_fft``) + hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``) + pad (int, optional): Two sided padding of signal. (Default: ``0``) + window_fn (Callable[..., torch.Tensor], optional): A function to create a window tensor + that is applied/multiplied to each frame/window. (Default: ``torch.hann_window``) + power (float, optional): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: ``2``) + normalized (bool, optional): Whether to normalize by magnitude after stft. (Default: ``False``) + wkwargs (Dict[..., ...] or None, optional): Arguments for window function. (Default: ``None``) + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + (Default: ``True``) + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. (Default: ``"reflect"``) + n_chroma (int, optional): Number of chroma. (Default: ``12``) + tuning (float, optional): Tuning deviation from A440 in fractions of a chroma bin. (Default: 0.0) + ctroct (float, optional): Center of Gaussian dominance window to weight filters by, in octaves. (Default: 5.0) + octwidth (float or None, optional): Width of Gaussian dominance window to weight filters by, in octaves. + If ``None``, then disable weighting altogether. (Default: 2.0) + norm (int, optional): order of norm to normalize filter bank by. (Default: 2) + base_c (bool, optional): If True, then start filter bank at C. Otherwise, start at A.
(Default: True) + + Example + >>> waveform, sample_rate = torchaudio.load("test.wav", normalize=True) + >>> transform = transforms.ChromaSpectrogram(sample_rate=sample_rate, n_fft=400) + >>> chromagram = transform(waveform) # (channel, n_chroma, time) + """ + + def __init__( + self, + sample_rate: int, + n_fft: int, + *, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + pad: int = 0, + window_fn: Callable[..., torch.Tensor] = torch.hann_window, + power: float = 2.0, + normalized: bool = False, + wkwargs: Optional[dict] = None, + center: bool = True, + pad_mode: str = "reflect", + n_chroma: int = 12, + tuning: float = 0.0, + ctroct: float = 5.0, + octwidth: Optional[float] = 2.0, + norm: int = 2, + base_c: bool = True, + ): + super().__init__() + self.spectrogram = Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + pad=pad, + window_fn=window_fn, + power=power, + normalized=normalized, + wkwargs=wkwargs, + center=center, + pad_mode=pad_mode, + onesided=True, + ) + self.chroma_scale = ChromaScale( + sample_rate, + n_fft // 2 + 1, + n_chroma=n_chroma, + tuning=tuning, + base_c=base_c, + ctroct=ctroct, + octwidth=octwidth, + norm=norm, + ) + + def forward(self, waveform: torch.Tensor) -> torch.Tensor: + r""" + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + + Returns: + Tensor: Chromagram of size (..., ``n_chroma``, time). + """ + spectrogram = self.spectrogram(waveform) + chroma_spectrogram = self.chroma_scale(spectrogram) + return chroma_spectrogram diff --git a/torchaudio/sox_effects/sox_effects.py b/torchaudio/sox_effects/sox_effects.py index e876788df4..c343680b65 100644 --- a/torchaudio/sox_effects/sox_effects.py +++ b/torchaudio/sox_effects/sox_effects.py @@ -1,5 +1,4 @@ import os -import warnings from typing import List, Optional, Tuple import torch @@ -156,14 +155,6 @@ def apply_effects_tensor( return torch.ops.torchaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first) -_deprecation_message = ( - "File-like object support in sox_io backend is deprecated, " - "and will be removed in v2.1. " - "See https://github.com/pytorch/audio/issues/2950 for the detail." - "Please migrate to the new dispatcher, or use soundfile backend." -) - - @torchaudio._extension.fail_if_no_sox def apply_effects_file( path: str, @@ -187,18 +178,8 @@ def apply_effects_file( rate and leave samples untouched. Args: - path (path-like object or file-like object): - Source of audio data. When the function is not compiled by TorchScript, - (e.g. ``torch.jit.script``), the following types are accepted: - - * ``path-like``: file path - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - When the function is compiled by TorchScript, only ``str`` type is allowed. - - Note: This argument is intentionally annotated as ``str`` only for - TorchScript compiler compatibility. + path (path-like object): + Source of audio data. effects (List[List[str]]): List of effects. normalize (bool, optional): When ``True``, this function converts the native sample type to ``float32``. 
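Returning to the chroma transforms added above: the sketch below exercises the manual composition that `ChromaSpectrogram` wraps (a `Spectrogram` followed by `ChromaScale`). The waveform, sample rate and `n_fft` values are arbitrary; the output shapes follow the docstrings above.

import torch
from torchaudio.prototype.transforms import ChromaScale, ChromaSpectrogram
from torchaudio.transforms import Spectrogram

waveform = torch.randn(1, 16000)  # (channel, time), e.g. one second at 16 kHz

# Manual composition: magnitude spectrogram, then frequency-to-chroma projection
spec = Spectrogram(n_fft=400)(waveform)                      # (1, 201, frames)
chroma = ChromaScale(sample_rate=16000, n_freqs=201)(spec)   # (1, 12, frames)

# Equivalent one-step transform
chroma2 = ChromaSpectrogram(sample_rate=16000, n_fft=400)(waveform)  # (1, 12, frames)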
@@ -283,11 +264,10 @@ def apply_effects_file( """ if not torch.jit.is_scripting(): if hasattr(path, "read"): - warnings.warn(_deprecation_message) - ret = torchaudio.lib._torchaudio_sox.apply_effects_fileobj(path, effects, normalize, channels_first, format) - if ret is None: - raise RuntimeError("Failed to load audio from {}".format(path)) - return ret + raise RuntimeError( + "apply_effects_file function does not support file-like object. " + "Please use torchaudio.io.AudioEffector." + ) path = os.fspath(path) ret = torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format) if ret is not None: diff --git a/torchaudio/utils/sox_utils.py b/torchaudio/utils/sox_utils.py index 384c00bf82..a978e8d1db 100644 --- a/torchaudio/utils/sox_utils.py +++ b/torchaudio/utils/sox_utils.py @@ -4,7 +4,6 @@ from typing import Dict, List -import torch import torchaudio @@ -18,7 +17,7 @@ def set_seed(seed: int): See Also: http://sox.sourceforge.net/sox.html """ - torch.ops.torchaudio.sox_utils_set_seed(seed) + torchaudio.lib._torchaudio_sox.set_seed(seed) @torchaudio._extension.fail_if_no_sox @@ -36,7 +35,7 @@ def set_verbosity(verbosity: int): See Also: http://sox.sourceforge.net/sox.html """ - torch.ops.torchaudio.sox_utils_set_verbosity(verbosity) + torchaudio.lib._torchaudio_sox.set_verbosity(verbosity) @torchaudio._extension.fail_if_no_sox @@ -49,7 +48,7 @@ def set_buffer_size(buffer_size: int): See Also: http://sox.sourceforge.net/sox.html """ - torch.ops.torchaudio.sox_utils_set_buffer_size(buffer_size) + torchaudio.lib._torchaudio_sox.set_buffer_size(buffer_size) @torchaudio._extension.fail_if_no_sox @@ -63,7 +62,7 @@ def set_use_threads(use_threads: bool): See Also: http://sox.sourceforge.net/sox.html """ - torch.ops.torchaudio.sox_utils_set_use_threads(use_threads) + torchaudio.lib._torchaudio_sox.set_use_threads(use_threads) @torchaudio._extension.fail_if_no_sox @@ -73,7 +72,7 @@ def list_effects() -> Dict[str, str]: Returns: Dict[str, str]: Mapping from ``effect name`` to ``usage`` """ - return dict(torch.ops.torchaudio.sox_utils_list_effects()) + return dict(torchaudio.lib._torchaudio_sox.list_effects()) @torchaudio._extension.fail_if_no_sox @@ -83,7 +82,7 @@ def list_read_formats() -> List[str]: Returns: List[str]: List of supported audio formats """ - return torch.ops.torchaudio.sox_utils_list_read_formats() + return torchaudio.lib._torchaudio_sox.list_read_formats() @torchaudio._extension.fail_if_no_sox @@ -93,7 +92,7 @@ def list_write_formats() -> List[str]: Returns: List[str]: List of supported audio formats """ - return torch.ops.torchaudio.sox_utils_list_write_formats() + return torchaudio.lib._torchaudio_sox.list_write_formats() @torchaudio._extension.fail_if_no_sox @@ -103,4 +102,4 @@ def get_buffer_size() -> int: Returns: int: size in bytes of buffers used for processing audio. """ - return torch.ops.torchaudio.sox_utils_get_buffer_size() + return torchaudio.lib._torchaudio_sox.get_buffer_size()
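With file-like object support removed from `apply_effects_file`, the suggested replacement is `torchaudio.io.AudioEffector`, which in this change also gains the `output_sample_rate` argument on `apply` and `stream`. A minimal sketch; the effect string, sample rates and chunk size are arbitrary examples:

import torch
from torchaudio.io import AudioEffector

waveform = torch.randn(16000, 1)  # AudioEffector expects (time, channel)

effector = AudioEffector(effect="lowpass=f=300")

# Apply the FFmpeg filter and resample the result to 8 kHz in one call
out = effector.apply(waveform, sample_rate=16000, output_sample_rate=8000)

# Chunk-by-chunk processing for long inputs
for chunk in effector.stream(waveform, sample_rate=16000, frames_per_chunk=4096, output_sample_rate=8000):
    pass  # each chunk has shape (frames, channel)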