Skip to content

Commit

Permalink
2023-07-06 nightly release (ca66a1d)
Browse files Browse the repository at this point in the history
  • Loading branch information
chronos_secgrp_pytorch_oss_ci_oncall committed Jul 6, 2023
1 parent 55551d0 commit a02cd4e
Show file tree
Hide file tree
Showing 29 changed files with 510 additions and 938 deletions.
82 changes: 82 additions & 0 deletions .github/workflows/ffmpeg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# This job is not directly related to regular CI pipeline.
# It is intended to create FFmpeg binaries that we upload on S3,
# which then will be used during all the build process in CI or local.
#
# This job does not include uploading part.
# Upload needs to be done manually, and it should be done only once
# per new major release of FFmpeg.
name: FFmpeg Binaries

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0'  # weekly, on Sunday at 00:00 UTC

jobs:
  Linux-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for Linux
      upload-artifact: ffmpeg-linux-lgpl
      repository: pytorch/audio
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.sh
        cd "${FFMPEG_ROOT}/.."
        # -z added so the archive content matches its .tar.gz name
        # (tar -xf on the consumer side auto-detects compression).
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux/"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
  macOS-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
        runner: ["macos-m1-12", "macos-12"]
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for macOS ("${{ matrix.runner }}")
      upload-artifact: ffmpeg-macos-lgpl
      repository: pytorch/audio
      runner: "${{ matrix.runner }}"
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.sh
        cd "${FFMPEG_ROOT}/.."
        # -z added so the archive content matches its .tar.gz name.
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
        # Separate artifact dirs per architecture (x86_64 vs arm64).
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/macos_$(uname -m)"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
  Windows-LGPL:
    strategy:
      fail-fast: false
      matrix:
        ffmpeg_version: ["4.1.8", "5.0.3", "6.0"]
    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
    with:
      job-name: Build LGPL FFmpeg for Windows
      upload-artifact: ffmpeg-windows-lgpl
      repository: pytorch/audio
      script: |
        export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}"
        export FFMPEG_ROOT="${PWD}/third_party/ffmpeg"
        ./packaging/ffmpeg/build.bat
        cd "${FFMPEG_ROOT}/.."
        # Windows ships DLLs from bin/ rather than lib/.
        # -z added so the archive content matches its .tar.gz name.
        tar -czf ffmpeg.tar.gz ffmpeg/include ffmpeg/bin
        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/windows"
        mkdir -p "${artifact_dir}"
        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ else()
message(STATUS "Could not find ccache. Consider installing ccache to speed up compilation.")
endif()

add_subdirectory(third_party)
add_subdirectory(torchaudio/csrc)
if (BUILD_SOX)
add_subdirectory(third_party/sox)
add_subdirectory(torchaudio/csrc/sox)
endif()
if (USE_FFMPEG)
Expand Down
38 changes: 19 additions & 19 deletions examples/tutorials/ctc_forced_alignment_api_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
emissions, _ = model(waveform.to(device))
emissions = torch.log_softmax(emissions, dim=-1)

emission = emissions[0].cpu().detach()
emission = emissions.cpu().detach()
dictionary = {c: i for i, c in enumerate(labels)}

print(dictionary)
Expand All @@ -107,7 +107,7 @@
# ^^^^^^^^^^^^^
#

plt.imshow(emission.T)
plt.imshow(emission[0].T)
plt.colorbar()
plt.title("Frame-wise class probabilities")
plt.xlabel("Time")
Expand Down Expand Up @@ -205,27 +205,27 @@ def compute_alignments(transcript, dictionary, emission):
frames = []
tokens = [dictionary[c] for c in transcript.replace(" ", "")]

targets = torch.tensor(tokens, dtype=torch.int32)
input_lengths = torch.tensor(emission.shape[0])
target_lengths = torch.tensor(targets.shape[0])
targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
input_lengths = torch.tensor([emission.shape[1]])
target_lengths = torch.tensor([targets.shape[1]])

# This is the key step, where we call the forced alignment API functional.forced_align to compute alignments.
frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)

assert len(frame_alignment) == input_lengths.item()
assert len(targets) == target_lengths.item()
assert frame_alignment.shape[1] == input_lengths[0].item()
assert targets.shape[1] == target_lengths[0].item()

token_index = -1
prev_hyp = 0
for i in range(len(frame_alignment)):
if frame_alignment[i].item() == 0:
for i in range(frame_alignment.shape[1]):
if frame_alignment[0][i].item() == 0:
prev_hyp = 0
continue

if frame_alignment[i].item() != prev_hyp:
if frame_alignment[0][i].item() != prev_hyp:
token_index += 1
frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
prev_hyp = frame_alignment[i].item()
frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
prev_hyp = frame_alignment[0][i].item()
return frames, frame_alignment, frame_scores


Expand Down Expand Up @@ -390,7 +390,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plt.rcParams.update({"font.size": 30})

# The original waveform
ratio = waveform.size(0) / input_lengths
ratio = waveform.size(1) / input_lengths
ax2.plot(waveform)
ax2.set_ylim(-1.0 * scale, 1.0 * scale)
ax2.set_xlim(0, waveform.size(-1))
Expand All @@ -414,8 +414,8 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plot_alignments(
segments,
word_segments,
waveform[0],
emission.shape[0],
waveform,
emission.shape[1],
1,
)
plt.show()
Expand All @@ -428,7 +428,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
# `IPython.display.Audio` has to be the last call in a cell,
# and there should be only one call per cell.
def display_segment(i, waveform, word_segments, frame_alignment):
ratio = waveform.size(1) / len(frame_alignment)
ratio = waveform.size(1) / frame_alignment.size(1)
word = word_segments[i]
x0 = int(ratio * word.start)
x1 = int(ratio * word.end)
Expand Down Expand Up @@ -511,19 +511,19 @@ def display_segment(i, waveform, word_segments, frame_alignment):
# Append the extra dimension corresponding to the <star> token
extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
emissions = torch.cat((emissions.cpu(), extra_dim), 2)
emission = emissions[0].detach()
emission = emissions.detach()

# Extend the dictionary to include the <star> token.
dictionary["*"] = 29

assert len(dictionary) == emission.shape[1]
assert len(dictionary) == emission.shape[2]


def compute_and_plot_alignments(transcript, dictionary, emission, waveform):
frames, frame_alignment, _ = compute_alignments(transcript, dictionary, emission)
segments = merge_repeats(frames, transcript)
word_segments = merge_words(transcript, segments, "|")
plot_alignments(segments, word_segments, waveform[0], emission.shape[0], 1)
plot_alignments(segments, word_segments, waveform, emission.shape[1], 1)
plt.show()
return word_segments, frame_alignment

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,27 +90,27 @@ def compute_alignments(transcript, dictionary, emission):
frames = []
tokens = [dictionary[c] for c in transcript.replace(" ", "")]

targets = torch.tensor(tokens, dtype=torch.int32)
input_lengths = torch.tensor(emission.shape[0])
target_lengths = torch.tensor(targets.shape[0])
targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0)
input_lengths = torch.tensor([emission.shape[1]])
target_lengths = torch.tensor([targets.shape[1]])

# This is the key step, where we call the forced alignment API functional.forced_align to compute frame alignments.
frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0)

assert len(frame_alignment) == input_lengths.item()
assert len(targets) == target_lengths.item()
assert frame_alignment.shape[1] == input_lengths[0].item()
assert targets.shape[1] == target_lengths[0].item()

token_index = -1
prev_hyp = 0
for i in range(len(frame_alignment)):
if frame_alignment[i].item() == 0:
for i in range(frame_alignment.shape[1]):
if frame_alignment[0][i].item() == 0:
prev_hyp = 0
continue

if frame_alignment[i].item() != prev_hyp:
if frame_alignment[0][i].item() != prev_hyp:
token_index += 1
frames.append(Frame(token_index, i, frame_scores[i].exp().item()))
prev_hyp = frame_alignment[i].item()
frames.append(Frame(token_index, i, frame_scores[0][i].exp().item()))
prev_hyp = frame_alignment[0][i].item()

# compute frame alignments from token alignments
transcript_nospace = transcript.replace(" ", "")
Expand Down Expand Up @@ -150,7 +150,7 @@ def compute_alignments(transcript, dictionary, emission):
i2 += 1
i3 += 1

num_frames = len(frame_alignment)
num_frames = frame_alignment.shape[1]
return segments, words, num_frames


Expand All @@ -160,7 +160,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10):
plt.rcParams.update({"font.size": 30})

# The original waveform
ratio = waveform.size(0) / input_lengths
ratio = waveform.size(1) / input_lengths
ax2.plot(waveform)
ax2.set_ylim(-1.0 * scale, 1.0 * scale)
ax2.set_xlim(0, waveform.size(-1))
Expand Down Expand Up @@ -249,12 +249,12 @@ def get_emission(waveform):

emissions, _ = model(waveform)
emissions = torch.log_softmax(emissions, dim=-1)
emission = emissions[0].cpu().detach()
emission = emissions.cpu().detach()

# Append the extra dimension corresponding to the <star> token
extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1)
emissions = torch.cat((emissions.cpu(), extra_dim), 2)
emission = emissions[0].detach()
emission = emissions.detach()
return emission, waveform


Expand Down Expand Up @@ -347,12 +347,12 @@ def get_emission(waveform):
waveform, _ = torchaudio.load(speech_file)

emission, waveform = get_emission(waveform)
assert len(dictionary) == emission.shape[1]
assert len(dictionary) == emission.shape[2]

transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -482,13 +482,14 @@ def get_emission(waveform):
text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian"
speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False)
waveform, _ = torchaudio.load(speech_file)
waveform = waveform[0:1]

emission, waveform = get_emission(waveform)

transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -557,7 +558,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -660,7 +661,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down Expand Up @@ -785,7 +786,7 @@ def get_emission(waveform):
transcript = text_normalized

segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission)
plot_alignments(segments, word_segments, waveform[0], emission.shape[0])
plot_alignments(segments, word_segments, waveform, emission.shape[1])

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)
Expand Down
32 changes: 26 additions & 6 deletions packaging/ffmpeg/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ if [[ "$OSTYPE" == "msys" ]]; then
args="--toolchain=msvc"
fi

archive="https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n${FFMPEG_VERSION:-4.1.8}.tar.gz"

build_dir=$(mktemp -d -t ffmpeg-build.XXXXXXXXXX)
cleanup() {
rm -rf "${build_dir}"
Expand All @@ -32,7 +34,7 @@ cd "${build_dir}"
# NOTE:
# When changing the version of FFmpeg, update the README so that the link to the source points
# the same version.
curl -LsS -o ffmpeg.tar.gz https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n4.1.8.tar.gz
curl -LsS -o ffmpeg.tar.gz "${archive}"
tar -xf ffmpeg.tar.gz --strip-components 1
./configure \
--prefix="${prefix}" \
Expand Down Expand Up @@ -72,11 +74,29 @@ ls ${prefix}/*
# macOS: Fix rpath so that the libraries are searched dynamically in user environment.
# In Linux, this is handled by `--enable-rpath` flag.
if [[ "$(uname)" == Darwin ]]; then
avcodec=libavcodec.58
avdevice=libavdevice.58
avfilter=libavfilter.7
avformat=libavformat.58
avutil=libavutil.56
major_ver=${FFMPEG_VERSION:0:1}
if [[ ${major_ver} == 4 ]]; then
avutil=libavutil.56
avcodec=libavcodec.58
avformat=libavformat.58
avdevice=libavdevice.58
avfilter=libavfilter.7
elif [[ ${major_ver} == 5 ]]; then
avutil=libavutil.57
avcodec=libavcodec.59
avformat=libavformat.59
avdevice=libavdevice.59
avfilter=libavfilter.8
elif [[ ${major_ver} == 6 ]]; then
avutil=libavutil.58
avcodec=libavcodec.60
avformat=libavformat.60
avdevice=libavdevice.60
avfilter=libavfilter.9
else
printf "Error: unexpected FFmpeg major version: %s\n" ${major_ver}
exit 1;
fi

otool="/usr/bin/otool"
# NOTE: miniconda has a version of otool and install_name_tool installed and we want
Expand Down
Loading

0 comments on commit a02cd4e

Please sign in to comment.