Skip to content

Commit

Permalink
2023-07-06 nightly release (ca66a1d)
Browse files Browse the repository at this point in the history
  • Loading branch information
chronos_secgrp_pytorch_oss_ci_oncall committed Jul 6, 2023
1 parent 55551d0 commit a02cd4e
Show file tree
Hide file tree
Showing 29 changed files with 510 additions and 938 deletions.
82 changes: 82 additions & 0 deletions .github/workflows/ffmpeg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# This job is not directly related to regular CI pipeline.
# It is intended to create FFmpeg binaries that we upload on S3,
# which then will be used during all the build process in CI or local.
#
# This job does not include uploading part.
# Upload needs to be done manually, and it should be done only once
# per new major release of FFmpeg.
name: FFmpeg Binaries

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0'  # weekly, on Sunday at 00:00 UTC

jobs:
  Linux-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for Linux
      upload-artifact: ffmpeg-linux-lgpl
      repository: pytorch/audio
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.sh
        cd "${FFMPEG_ROOT}/.."
        # -z added so the archive content matches its .tar.gz name
        # (tar -xf on the consumer side auto-detects compression).
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux/"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
  macOS-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
        runner: ["macos-m1-12", "macos-12"]
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for macOS ("${{ matrix.runner }}")
      upload-artifact: ffmpeg-macos-lgpl
      repository: pytorch/audio
      runner: "${{ matrix.runner }}"
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.sh
        cd "${FFMPEG_ROOT}/.."
        # -z added so the archive content matches its .tar.gz name.
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
        # Separate artifact dirs per architecture (x86_64 vs arm64).
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/macos_$(uname -m)"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
  Windows-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for Windows
      upload-artifact: ffmpeg-windows-lgpl
      repository: pytorch/audio
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.bat
        cd "${FFMPEG_ROOT}/.."
        # Windows ships DLLs from bin/ rather than lib/.
        # -z added so the archive content matches its .tar.gz name.
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/bin
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/windows"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ else()
message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
endif()

add_subdirectory(third_party)
add_subdirectory(torchaudio/csrc)
if (BUILD_SOX)
add_subdirectory(third_party/sox)
add_subdirectory(torchaudio/csrc/sox)
endif()
if (USE_FFMPEG)
Expand Down
38 changes: 19 additions & 19 deletions examples/tutorials/ctc_forced_alignment_api_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
emissions, _ = model(waveform.to(device))
emissions = torch.log_softmax(emissions, dim=-1)

emission = emissions[0].cpu().detach()
emission = emissions.cpu().detach()
dictionary = {c: i for i, c in enumerate(labels)}

print(dictionary)
Expand All @@ -107,7 +107,7 @@
# ^^^^^^^^^^^^^
#

plt.imshow(emission.T)
plt.imshow(emission[0].T)
plt.colorbar()
plt.title("Frame-wise class probabilities")
plt.xlabel("Time")
Expand Down Expand Up @@ -205,27 +205,27 @@ def compute_alignments(transcript, dictionary, emission):
frames = []
tokens = [dictionary[c] for c in transcript.replace(" ", "")]

targets = torch.tensor(tokens, dtype=torch.int32)
input_lengths = torch.tensor(emission.shape[0])
target_lengths = torch.tensor(targets.shape[0])
targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
input_lengths = torch.tensor([emission.shape[1]])
target_lengths = torch.tensor([targets.shape[1]])

# This is the key step, where we call the forced alignment API functional.forced_align to compute alignments.
frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)

assert len(frame_alignment) == input_lengths.item()
assert len(targets) == target_lengths.item()
assert frame_alignment.shape[1] == input_lengths[0].item()
assert targets.shape[1] == target_lengths[0].item()

token_index = -1
prev_hyp = 0
for i in range(len(frame_alignment)):
if frame_alignment[i].item() == 0:
for i in range(frame_alignment.shape[1]):
if frame_alignment[0][i].item() == 0:
prev_hyp = 0
continue

if frame_alignment[i].item() != prev_hyp:
if frame_alignment[0][i].item() != prev_hyp:
token_index += 1
frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
prev_hyp = frame_alignment[i].item()
frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
prev_hyp = frame_alignment[0][i].item()
return frames, frame_alignment, frame_scores


Expand Down Expand Up @@ -390,7 +390,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plt.rcParams.update({"font.size": 30})

# The original waveform
ratio = waveform.size(0) / input_lengths
ratio = waveform.size(1) / input_lengths
ax2.plot(waveform)
ax2.set_ylim(-1.0 * scale, 1.0 * scale)
ax2.set_xlim(0, waveform.size(-1))
Expand All @@ -414,8 +414,8 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plot_alignments(
segments,
word_segments,
waveform[0],
emission.shape[0],
waveform,
emission.shape[1],
1,
)
plt.show()
Expand All @@ -428,7 +428,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
# `IPython.display.Audio` has to be the last call in a cell,
# and there should be only one call per cell.
def display_segment(i, waveform, word_segments, frame_alignment):
ratio = waveform.size(1) / len(frame_alignment)
ratio = waveform.size(1) / frame_alignment.size(1)
word = word_segments[i]
x0 = int(ratio * word.start)
x1 = int(ratio * word.end)
Expand Down Expand Up @@ -511,19 +511,19 @@ def display_segment(i, waveform, word_segments, frame_alignment):
# Append the extra dimension corresponding to the <star> token
extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
emissions = torch.cat((emissions.cpu(), extra_dim), 2)
emission = emissions[0].detach()
emission = emissions.detach()

# Extend the dictionary to include the <star> token.
dictionary["*"] = 29

assert len(dictionary) == emission.shape[1]
assert len(dictionary) == emission.shape[2]


def compute_and_plot_alignments(transcript, dictionary, emission, waveform):
frames, frame_alignment, _ = compute_alignments(transcript, dictionary, emission)
segments = merge_repeats(frames, transcript)
word_segments = merge_words(transcript, segments, "|")
plot_alignments(segments, word_segments, waveform[0], emission.shape[0], 1)
plot_alignments(segments, word_segments, waveform, emission.shape[1], 1)
plt.show()
return word_segments, frame_alignment

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,27 +90,27 @@ def compute_alignments(transcript, dictionary, emission):
frames = []
tokens = [dictionary[c] for c in transcript.replace(" ", "")]

targets = torch.tensor(tokens, dtype=torch.int32)
input_lengths = torch.tensor(emission.shape[0])
target_lengths = torch.tensor(targets.shape[0])
targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
input_lengths = torch.tensor([emission.shape[1]])
target_lengths = torch.tensor([targets.shape[1]])

# This is the key step, where we call the forced alignment API functional.forced_align to compute frame alignments.
frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)

assert len(frame_alignment) == input_lengths.item()
assert len(targets) == target_lengths.item()
assert frame_alignment.shape[1] == input_lengths[0].item()
assert targets.shape[1] == target_lengths[0].item()

token_index = -1
prev_hyp = 0
for i in range(len(frame_alignment)):
if frame_alignment[i].item() == 0:
for i in range(frame_alignment.shape[1]):
if frame_alignment[0][i].item() == 0:
prev_hyp = 0
continue

if frame_alignment[i].item() != prev_hyp:
if frame_alignment[0][i].item() != prev_hyp:
token_index += 1
frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
prev_hyp = frame_alignment[i].item()
frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
prev_hyp = frame_alignment[0][i].item()

# compute frame alignments from token alignments
transcript_nospace = transcript.replace(" ", "")
Expand Down Expand Up @@ -150,7 +150,7 @@ def compute_alignments(transcript, dictionary, emission):
i2 += 1
i3 += 1

num_frames = len(frame_alignment)
num_frames = frame_alignment.shape[1]
return segments, words, num_frames


Expand All @@ -160,7 +160,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plt.rcParams.update({"font.size": 30})

# The original waveform
ratio = waveform.size(0) / input_lengths
ratio = waveform.size(1) / input_lengths
ax2.plot(waveform)
ax2.set_ylim(-1.0 * scale, 1.0 * scale)
ax2.set_xlim(0, waveform.size(-1))
Expand Down Expand Up @@ -249,12 +249,12 @@ def get_emission(waveform):

emissions, _ = model(waveform)
emissions = torch.log_softmax(emissions, dim=-1)
emission = emissions[0].cpu().detach()
emission = emissions.cpu().detach()

# Append the extra dimension corresponding to the <star> token
extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
emissions = torch.cat((emissions.cpu(), extra_dim), 2)
emission = emissions[0].detach()
emission = emissions.detach()
return emission, waveform


Expand Down Expand Up @@ -347,12 +347,12 @@ def get_emission(waveform):
waveform, _ = torchaudio.load(speech_file)

emission, waveform = get_emission(waveform)
assert len(dictionary) == emission.shape[1]
assert len(dictionary) == emission.shape[2]

transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -482,13 +482,14 @@ def get_emission(waveform):
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"
speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False)
waveform, _ = torchaudio.load(speech_file)
waveform = waveform[0:1]

emission, waveform = get_emission(waveform)

transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -557,7 +558,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -660,7 +661,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -785,7 +786,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down
32 changes: 26 additions & 6 deletions packaging/ffmpeg/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ if [[ "$OSTYPE" == "msys" ]]; then
args="--toolchain=msvc"
fi

archive="https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n${FFMPEG_VERSION:-4.1.8}.tar.gz"

build_dir=$(mktemp -d -t ffmpeg-build.XXXXXXXXXX)
cleanup() {
rm -rf "${build_dir}"
Expand All @@ -32,7 +34,7 @@ cd "${build_dir}"
# NOTE:
# When changing the version of FFmpeg, update the README so that the link to the source points
# the same version.
curl -LsS -o ffmpeg.tar.gz https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n4.1.8.tar.gz
curl -LsS -o ffmpeg.tar.gz "${archive}"
tar -xf ffmpeg.tar.gz --strip-components 1
./configure \
--prefix="${prefix}" \
Expand Down Expand Up @@ -72,11 +74,29 @@ ls ${prefix}/*
# macOS: Fix rpath so that the libraries are searched dynamically in user environment.
# In Linux, this is handled by `--enable-rpath` flag.
if [[ "$(uname)" == Darwin ]]; then
avcodec=libavcodec.58
avdevice=libavdevice.58
avfilter=libavfilter.7
avformat=libavformat.58
avutil=libavutil.56
major_ver=${FFMPEG_VERSION:0:1}
if [[ ${major_ver} == 4 ]]; then
avutil=libavutil.56
avcodec=libavcodec.58
avformat=libavformat.58
avdevice=libavdevice.58
avfilter=libavfilter.7
elif [[ ${major_ver} == 5 ]]; then
avutil=libavutil.57
avcodec=libavcodec.59
avformat=libavformat.59
avdevice=libavdevice.59
avfilter=libavfilter.8
elif [[ ${major_ver} == 6 ]]; then
avutil=libavutil.58
avcodec=libavcodec.60
avformat=libavformat.60
avdevice=libavdevice.60
avfilter=libavfilter.9
else
printf "Error: unexpected FFmpeg major version: %s\n" ${major_ver}
exit 1;
fi

otool="/usr/bin/otool"
# NOTE: miniconda has a version of otool and install_name_tool installed and we want
Expand Down
Loading

0 comments on commit a02cd4e

Please sign in to comment.