From a02cd4e65443a691e752ed4ac815d34590d4f233 Mon Sep 17 00:00:00 2001 From: chronos_secgrp_pytorch_oss_ci_oncall Date: Thu, 6 Jul 2023 04:30:47 -0700 Subject: [PATCH] 2023-07-06 nightly release (ca66a1d3a0a000031ee0927c17c5b223dc119077) --- .github/workflows/ffmpeg.yml | 82 +++++ CMakeLists.txt | 2 +- .../ctc_forced_alignment_api_tutorial.py | 38 +-- ...lignment_for_multilingual_data_tutorial.py | 41 +-- packaging/ffmpeg/build.sh | 32 +- .../functional/functional_impl.py | 85 ++--- third_party/CMakeLists.txt | 11 - tools/setup_helpers/extension.py | 2 - torchaudio/csrc/ffmpeg/CMakeLists.txt | 18 +- torchaudio/csrc/ffmpeg/ffmpeg.cpp | 41 +-- torchaudio/csrc/ffmpeg/ffmpeg.h | 5 +- torchaudio/csrc/ffmpeg/filter_graph.cpp | 40 ++- torchaudio/csrc/ffmpeg/hw_context.cpp | 3 +- torchaudio/csrc/ffmpeg/pybind/pybind.cpp | 50 ++- .../csrc/ffmpeg/stream_reader/conversion.cpp | 13 +- .../ffmpeg/stream_reader/packet_buffer.cpp | 4 +- .../ffmpeg/stream_reader/post_process.cpp | 37 +-- .../ffmpeg/stream_reader/stream_processor.cpp | 43 ++- .../ffmpeg/stream_reader/stream_reader.cpp | 47 ++- .../ffmpeg/stream_writer/encode_process.cpp | 94 +++--- .../csrc/ffmpeg/stream_writer/encoder.cpp | 12 +- .../ffmpeg/stream_writer/packet_writer.cpp | 14 +- .../ffmpeg/stream_writer/stream_writer.cpp | 32 +- .../ffmpeg/stream_writer/tensor_converter.cpp | 28 +- torchaudio/csrc/ffmpeg/stub.cpp | 196 ----------- torchaudio/csrc/ffmpeg/stub.h | 313 ------------------ torchaudio/csrc/forced_align/cpu/compute.cpp | 71 ++-- torchaudio/csrc/forced_align/gpu/compute.cu | 83 +++-- torchaudio/functional/functional.py | 11 +- 29 files changed, 510 insertions(+), 938 deletions(-) create mode 100644 .github/workflows/ffmpeg.yml delete mode 100644 third_party/CMakeLists.txt delete mode 100644 torchaudio/csrc/ffmpeg/stub.cpp delete mode 100644 torchaudio/csrc/ffmpeg/stub.h diff --git a/.github/workflows/ffmpeg.yml b/.github/workflows/ffmpeg.yml new file mode 100644 index 0000000000..cc39217e65 --- /dev/null 
+++ b/.github/workflows/ffmpeg.yml @@ -0,0 +1,82 @@ +# This job is not directly related to regular CI pipeline. +# It is intended to create FFmpeg binaries that we upload on S3, +# which then will be used during all the build process in CI or local. +# +# This job does not include uploading part. +# Upload needs to be done manually, and it should be done only once +# per new major release of FFmpeg. +name: FFmpeg Binaries + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' # on sunday + +jobs: + Linux-LGPL: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + job-name: Build LGPL FFmpeg for Linux + upload-artifact: ffmpeg-linux-lgpl + repository: pytorch/audio + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.sh + + cd "${FFMPEG_ROOT}/.." + tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux/" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" + + macOS-LGPL: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + runner: ["macos-m1-12", "macos-12"] + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + job-name: Build LGPL FFmpeg for macOS ("${{ matrix.runner }}") + upload-artifact: ffmpeg-macos-lgpl + repository: pytorch/audio + runner: "${{ matrix.runner }}" + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.sh + + cd "${FFMPEG_ROOT}/.." 
+ tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/macos_$(uname -m)" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" + + Windows-LGPL: + strategy: + fail-fast: false + matrix: + ffmpeg_version: ["4.1.8", "5.0.3", "6.0"] + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + job-name: Build LGPL FFmpeg for Windows + upload-artifact: ffmpeg-windows-lgpl + repository: pytorch/audio + script: | + export FFMPEG_VERSION="${{ matrix.ffmpeg_version }}" + export FFMPEG_ROOT="${PWD}/third_party/ffmpeg" + ./packaging/ffmpeg/build.bat + + cd "${FFMPEG_ROOT}/.." + tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/bin + + artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/windows" + mkdir -p "${artifact_dir}" + mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz" diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e45a134ed..7649c92050 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,9 +165,9 @@ else() message(STATUS "Could not find ccache. 
Consider installing ccache to speed up compilation.") endif() -add_subdirectory(third_party) add_subdirectory(torchaudio/csrc) if (BUILD_SOX) + add_subdirectory(third_party/sox) add_subdirectory(torchaudio/csrc/sox) endif() if (USE_FFMPEG) diff --git a/examples/tutorials/ctc_forced_alignment_api_tutorial.py b/examples/tutorials/ctc_forced_alignment_api_tutorial.py index be8fc27d7a..a0d3d7acb7 100644 --- a/examples/tutorials/ctc_forced_alignment_api_tutorial.py +++ b/examples/tutorials/ctc_forced_alignment_api_tutorial.py @@ -96,7 +96,7 @@ emissions, _ = model(waveform.to(device)) emissions = torch.log_softmax(emissions, dim=-1) -emission = emissions[0].cpu().detach() +emission = emissions.cpu().detach() dictionary = {c: i for i, c in enumerate(labels)} print(dictionary) @@ -107,7 +107,7 @@ # ^^^^^^^^^^^^^ # -plt.imshow(emission.T) +plt.imshow(emission[0].T) plt.colorbar() plt.title("Frame-wise class probabilities") plt.xlabel("Time") @@ -205,27 +205,27 @@ def compute_alignments(transcript, dictionary, emission): frames = [] tokens = [dictionary[c] for c in transcript.replace(" ", "")] - targets = torch.tensor(tokens, dtype=torch.int32) - input_lengths = torch.tensor(emission.shape[0]) - target_lengths = torch.tensor(targets.shape[0]) + targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0) + input_lengths = torch.tensor([emission.shape[1]]) + target_lengths = torch.tensor([targets.shape[1]]) # This is the key step, where we call the forced alignment API functional.forced_align to compute alignments. 
frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0) - assert len(frame_alignment) == input_lengths.item() - assert len(targets) == target_lengths.item() + assert frame_alignment.shape[1] == input_lengths[0].item() + assert targets.shape[1] == target_lengths[0].item() token_index = -1 prev_hyp = 0 - for i in range(len(frame_alignment)): - if frame_alignment[i].item() == 0: + for i in range(frame_alignment.shape[1]): + if frame_alignment[0][i].item() == 0: prev_hyp = 0 continue - if frame_alignment[i].item() != prev_hyp: + if frame_alignment[0][i].item() != prev_hyp: token_index += 1 - frames.append(Frame(token_index, i, frame_scores[i].exp().item())) - prev_hyp = frame_alignment[i].item() + frames.append(Frame(token_index, i, frame_scores[0][i].exp().item())) + prev_hyp = frame_alignment[0][i].item() return frames, frame_alignment, frame_scores @@ -390,7 +390,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): plt.rcParams.update({"font.size": 30}) # The original waveform - ratio = waveform.size(0) / input_lengths + ratio = waveform.size(1) / input_lengths ax2.plot(waveform) ax2.set_ylim(-1.0 * scale, 1.0 * scale) ax2.set_xlim(0, waveform.size(-1)) @@ -414,8 +414,8 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): plot_alignments( segments, word_segments, - waveform[0], - emission.shape[0], + waveform, + emission.shape[1], 1, ) plt.show() @@ -428,7 +428,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): # `IPython.display.Audio` has to be the last call in a cell, # and there should be only one call par cell. 
def display_segment(i, waveform, word_segments, frame_alignment): - ratio = waveform.size(1) / len(frame_alignment) + ratio = waveform.size(1) / frame_alignment.size(1) word = word_segments[i] x0 = int(ratio * word.start) x1 = int(ratio * word.end) @@ -511,19 +511,19 @@ def display_segment(i, waveform, word_segments, frame_alignment): # Append the extra dimension corresponding to the token extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1) emissions = torch.cat((emissions.cpu(), extra_dim), 2) - emission = emissions[0].detach() + emission = emissions.detach() # Extend the dictionary to include the token. dictionary["*"] = 29 -assert len(dictionary) == emission.shape[1] +assert len(dictionary) == emission.shape[2] def compute_and_plot_alignments(transcript, dictionary, emission, waveform): frames, frame_alignment, _ = compute_alignments(transcript, dictionary, emission) segments = merge_repeats(frames, transcript) word_segments = merge_words(transcript, segments, "|") - plot_alignments(segments, word_segments, waveform[0], emission.shape[0], 1) + plot_alignments(segments, word_segments, waveform, emission.shape[1], 1) plt.show() return word_segments, frame_alignment diff --git a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py index 95251c6198..01333d7175 100644 --- a/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py +++ b/examples/tutorials/forced_alignment_for_multilingual_data_tutorial.py @@ -90,27 +90,27 @@ def compute_alignments(transcript, dictionary, emission): frames = [] tokens = [dictionary[c] for c in transcript.replace(" ", "")] - targets = torch.tensor(tokens, dtype=torch.int32) - input_lengths = torch.tensor(emission.shape[0]) - target_lengths = torch.tensor(targets.shape[0]) + targets = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0) + input_lengths = torch.tensor([emission.shape[1]]) + target_lengths = 
torch.tensor([targets.shape[1]]) # This is the key step, where we call the forced alignment API functional.forced_align to compute frame alignments. frame_alignment, frame_scores = forced_align(emission, targets, input_lengths, target_lengths, 0) - assert len(frame_alignment) == input_lengths.item() - assert len(targets) == target_lengths.item() + assert frame_alignment.shape[1] == input_lengths[0].item() + assert targets.shape[1] == target_lengths[0].item() token_index = -1 prev_hyp = 0 - for i in range(len(frame_alignment)): - if frame_alignment[i].item() == 0: + for i in range(frame_alignment.shape[1]): + if frame_alignment[0][i].item() == 0: prev_hyp = 0 continue - if frame_alignment[i].item() != prev_hyp: + if frame_alignment[0][i].item() != prev_hyp: token_index += 1 - frames.append(Frame(token_index, i, frame_scores[i].exp().item())) - prev_hyp = frame_alignment[i].item() + frames.append(Frame(token_index, i, frame_scores[0][i].exp().item())) + prev_hyp = frame_alignment[0][i].item() # compute frame alignments from token alignments transcript_nospace = transcript.replace(" ", "") @@ -150,7 +150,7 @@ def compute_alignments(transcript, dictionary, emission): i2 += 1 i3 += 1 - num_frames = len(frame_alignment) + num_frames = frame_alignment.shape[1] return segments, words, num_frames @@ -160,7 +160,7 @@ def plot_alignments(segments, word_segments, waveform, input_lengths, scale=10): plt.rcParams.update({"font.size": 30}) # The original waveform - ratio = waveform.size(0) / input_lengths + ratio = waveform.size(1) / input_lengths ax2.plot(waveform) ax2.set_ylim(-1.0 * scale, 1.0 * scale) ax2.set_xlim(0, waveform.size(-1)) @@ -249,12 +249,12 @@ def get_emission(waveform): emissions, _ = model(waveform) emissions = torch.log_softmax(emissions, dim=-1) - emission = emissions[0].cpu().detach() + emission = emissions.cpu().detach() # Append the extra dimension corresponding to the token extra_dim = torch.zeros(emissions.shape[0], emissions.shape[1], 1) emissions = 
torch.cat((emissions.cpu(), extra_dim), 2) - emission = emissions[0].detach() + emission = emissions.detach() return emission, waveform @@ -347,12 +347,12 @@ def get_emission(waveform): waveform, _ = torchaudio.load(speech_file) emission, waveform = get_emission(waveform) -assert len(dictionary) == emission.shape[1] +assert len(dictionary) == emission.shape[2] transcript = text_normalized segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) -plot_alignments(segments, word_segments, waveform[0], emission.shape[0]) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) print("Raw Transcript: ", text_raw) print("Normalized Transcript: ", text_normalized) @@ -482,13 +482,14 @@ def get_emission(waveform): text_normalized = "guan fuwu gaoduan chanpin reng chuyu gongbuyingqiu de jumian" speech_file = torchaudio.utils.download_asset("tutorial-assets/mvdr/clean_speech.wav", progress=False) waveform, _ = torchaudio.load(speech_file) +waveform = waveform[0:1] emission, waveform = get_emission(waveform) transcript = text_normalized segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) -plot_alignments(segments, word_segments, waveform[0], emission.shape[0]) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) print("Raw Transcript: ", text_raw) print("Normalized Transcript: ", text_normalized) @@ -557,7 +558,7 @@ def get_emission(waveform): transcript = text_normalized segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) -plot_alignments(segments, word_segments, waveform[0], emission.shape[0]) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) print("Raw Transcript: ", text_raw) print("Normalized Transcript: ", text_normalized) @@ -660,7 +661,7 @@ def get_emission(waveform): transcript = text_normalized segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) 
-plot_alignments(segments, word_segments, waveform[0], emission.shape[0]) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) print("Raw Transcript: ", text_raw) print("Normalized Transcript: ", text_normalized) @@ -785,7 +786,7 @@ def get_emission(waveform): transcript = text_normalized segments, word_segments, num_frames = compute_alignments(transcript, dictionary, emission) -plot_alignments(segments, word_segments, waveform[0], emission.shape[0]) +plot_alignments(segments, word_segments, waveform, emission.shape[1]) print("Raw Transcript: ", text_raw) print("Normalized Transcript: ", text_normalized) diff --git a/packaging/ffmpeg/build.sh b/packaging/ffmpeg/build.sh index 4ccdb6924b..7648830336 100755 --- a/packaging/ffmpeg/build.sh +++ b/packaging/ffmpeg/build.sh @@ -21,6 +21,8 @@ if [[ "$OSTYPE" == "msys" ]]; then args="--toolchain=msvc" fi +archive="https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n${FFMPEG_VERSION:-4.1.8}.tar.gz" + build_dir=$(mktemp -d -t ffmpeg-build.XXXXXXXXXX) cleanup() { rm -rf "${build_dir}" @@ -32,7 +34,7 @@ cd "${build_dir}" # NOTE: # When changing the version of FFmpeg, update the README so that the link to the source points # the same version. -curl -LsS -o ffmpeg.tar.gz https://github.com/FFmpeg/FFmpeg/archive/refs/tags/n4.1.8.tar.gz +curl -LsS -o ffmpeg.tar.gz "${archive}" tar -xf ffmpeg.tar.gz --strip-components 1 ./configure \ --prefix="${prefix}" \ @@ -72,11 +74,29 @@ ls ${prefix}/* # macOS: Fix rpath so that the libraries are searched dynamically in user environment. # In Linux, this is handled by `--enable-rpath` flag. 
if [[ "$(uname)" == Darwin ]]; then - avcodec=libavcodec.58 - avdevice=libavdevice.58 - avfilter=libavfilter.7 - avformat=libavformat.58 - avutil=libavutil.56 + major_ver=${FFMPEG_VERSION:0:1} + if [[ ${major_ver} == 4 ]]; then + avutil=libavutil.56 + avcodec=libavcodec.58 + avformat=libavformat.58 + avdevice=libavdevice.58 + avfilter=libavfilter.7 + elif [[ ${major_ver} == 5 ]]; then + avutil=libavutil.57 + avcodec=libavcodec.59 + avformat=libavformat.59 + avdevice=libavdevice.59 + avfilter=libavfilter.8 + elif [[ ${major_ver} == 6 ]]; then + avutil=libavutil.58 + avcodec=libavcodec.60 + avformat=libavformat.60 + avdevice=libavdevice.60 + avfilter=libavfilter.9 + else + printf "Error: unexpected FFmpeg major version: %s\n" ${major_ver} + exit 1; + fi otool="/usr/bin/otool" # NOTE: miniconda has a version of otool and install_name_tool installed and we want diff --git a/test/torchaudio_unittest/functional/functional_impl.py b/test/torchaudio_unittest/functional/functional_impl.py index d7847c034f..6bb6a9f8bf 100644 --- a/test/torchaudio_unittest/functional/functional_impl.py +++ b/test/torchaudio_unittest/functional/functional_impl.py @@ -1116,55 +1116,60 @@ def test_preemphasis_deemphasis_roundtrip(self, input_shape, coeff): @parameterized.expand( [ - ([0, 1, 1, 0], [0, 1, 5, 1, 0], torch.int32), - ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], torch.int32), - ([3, 3, 3], [3, 5, 3, 5, 3], torch.int64), - ([0, 1, 2], [0, 1, 1, 1, 2], torch.int64), + ([[0, 1, 1, 0]], [[0, 1, 5, 1, 0]], torch.int32), + ([[0, 1, 2, 3, 4]], [[0, 1, 2, 3, 4]], torch.int32), + ([[3, 3, 3]], [[3, 5, 3, 5, 3]], torch.int64), + ([[0, 1, 2]], [[0, 1, 1, 1, 2]], torch.int64), ] ) def test_forced_align(self, targets, ref_path, targets_dtype): emission = torch.tensor( [ - [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], - [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], - [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], - [0.0663296, 0.643849, 0.280111, 
0.00283995, 0.0035545, 0.00331533], - [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107], + [ + [0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], + [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], + [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], + [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], + [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107], + ] ], dtype=self.dtype, device=self.device, ) blank = 5 + batch_index = 0 ref_path = torch.tensor(ref_path, dtype=targets_dtype, device=self.device) ref_scores = torch.tensor( - [torch.log(emission[i, ref_path[i]]).item() for i in range(emission.shape[0])], + [torch.log(emission[batch_index, i, ref_path[batch_index, i]]).item() for i in range(emission.shape[1])], dtype=emission.dtype, device=self.device, - ) + ).unsqueeze(0) log_probs = torch.log(emission) targets = torch.tensor(targets, dtype=targets_dtype, device=self.device) - input_lengths = torch.tensor((log_probs.shape[0])) - target_lengths = torch.tensor((targets.shape[0])) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) + assert hyp_path.shape == ref_path.shape + assert hyp_scores.shape == ref_scores.shape self.assertEqual(hyp_path, ref_path) self.assertEqual(hyp_scores, ref_scores) @parameterized.expand([(torch.int32,), (torch.int64,)]) def test_forced_align_fail(self, targets_dtype): - log_probs = torch.rand(5, 6, dtype=self.dtype, device=self.device) - targets = torch.tensor([0, 1, 2, 3, 4, 4], dtype=targets_dtype, device=self.device) + log_probs = torch.rand(1, 5, 6, dtype=self.dtype, device=self.device) + targets = torch.tensor([[0, 1, 2, 3, 4, 4]], dtype=targets_dtype, device=self.device) blank = 5 - input_lengths = torch.tensor((log_probs.shape[0]), 
device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) with self.assertRaisesRegex(RuntimeError, r"targets length is too long for CTC"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([5, 3, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[5, 3, 3]], dtype=targets_dtype, device=self.device) with self.assertRaisesRegex(ValueError, r"targets Tensor shouldn't contain blank index"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) log_probs = log_probs.int() - targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device) with self.assertRaisesRegex(RuntimeError, r"log_probs must be float64, float32 or float16"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) @@ -1175,40 +1180,42 @@ def test_forced_align_fail(self, targets_dtype): log_probs = torch.rand(3, 4, 6, dtype=self.dtype, device=self.device) targets = targets.int() - with self.assertRaisesRegex(RuntimeError, r"3-D tensor is not yet supported for log_probs"): + with self.assertRaisesRegex( + RuntimeError, r"The batch dimension for log_probs must be 1 at the current version" + ): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) targets = torch.randint(0, 4, (3, 4), device=self.device) - log_probs = torch.rand(3, 6, dtype=self.dtype, device=self.device) - with self.assertRaisesRegex(RuntimeError, r"2-D tensor is not yet supported for targets"): + log_probs = torch.rand(1, 3, 6, dtype=self.dtype, device=self.device) + with self.assertRaisesRegex(RuntimeError, r"The batch dimension for targets must be 1 at 
the current version"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([0, 1, 2, 3], dtype=targets_dtype, device=self.device) - input_lengths = torch.randint(1, 5, (3,), device=self.device) - with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 0-D"): + targets = torch.tensor([[0, 1, 2, 3]], dtype=targets_dtype, device=self.device) + input_lengths = torch.randint(1, 5, (3, 5), device=self.device) + with self.assertRaisesRegex(RuntimeError, r"input_lengths must be 1-D"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((log_probs.shape[0]), device=self.device) - target_lengths = torch.randint(1, 5, (3,), device=self.device) - with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 0-D"): + input_lengths = torch.tensor([log_probs.shape[0]], device=self.device) + target_lengths = torch.randint(1, 5, (3, 5), device=self.device) + with self.assertRaisesRegex(RuntimeError, r"target_lengths must be 1-D"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((10000), device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([10000], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) with self.assertRaisesRegex(RuntimeError, r"input length mismatch"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - input_lengths = torch.tensor((log_probs.shape[0])) - target_lengths = torch.tensor((10000)) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([10000], device=self.device) with self.assertRaisesRegex(RuntimeError, r"target length mismatch"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, 
blank) - targets = torch.tensor([7, 8, 9, 10], dtype=targets_dtype, device=self.device) - log_probs = torch.rand(10, 5, dtype=self.dtype, device=self.device) + targets = torch.tensor([[7, 8, 9, 10]], dtype=targets_dtype, device=self.device) + log_probs = torch.rand(1, 10, 5, dtype=self.dtype, device=self.device) with self.assertRaisesRegex(ValueError, r"targets values must be less than the CTC dimension"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) - targets = torch.tensor([1, 3, 3], dtype=targets_dtype, device=self.device) + targets = torch.tensor([[1, 3, 3]], dtype=targets_dtype, device=self.device) blank = 10000 with self.assertRaisesRegex(RuntimeError, r"blank must be within \[0, num classes\)"): hyp_path, hyp_scores = F.forced_align(log_probs, targets, input_lengths, target_lengths, blank) @@ -1238,14 +1245,14 @@ class FunctionalCUDAOnly(TestBaseMixin): @nested_params( [torch.half, torch.float, torch.double], [torch.int32, torch.int64], - [(50, 100), (100, 100)], - [(10,), (40,), (45,)], + [(1, 50, 100), (1, 100, 100)], + [(1, 10), (1, 40), (1, 45)], ) def test_forced_align_same_result(self, log_probs_dtype, targets_dtype, log_probs_shape, targets_shape): log_probs = torch.rand(log_probs_shape, dtype=log_probs_dtype, device=self.device) targets = torch.randint(1, 100, targets_shape, dtype=targets_dtype, device=self.device) - input_lengths = torch.tensor((log_probs.shape[0]), device=self.device) - target_lengths = torch.tensor((targets.shape[0]), device=self.device) + input_lengths = torch.tensor([log_probs.shape[1]], device=self.device) + target_lengths = torch.tensor([targets.shape[1]], device=self.device) log_probs_cuda = log_probs.cuda() targets_cuda = targets.cuda() input_lengths_cuda = input_lengths.cuda() diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt deleted file mode 100644 index 07cd3c9d4b..0000000000 --- a/third_party/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ 
-set(CMAKE_CXX_VISIBILITY_PRESET hidden) - -file(MAKE_DIRECTORY install/include) -file(MAKE_DIRECTORY install/lib) - -################################################################################ -# sox -################################################################################ -if (BUILD_SOX) - add_subdirectory(sox) -endif() diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 95b59c207c..2e5610bc79 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -37,7 +37,6 @@ def _get_build(var, default=False): _BUILD_RIR = _get_build("BUILD_RIR", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True) _USE_FFMPEG = _get_build("USE_FFMPEG", False) -_DLOPEN_FFMPEG = _get_build("DLOPEN_FFMPEG", False) _USE_ROCM = _get_build("USE_ROCM", torch.backends.cuda.is_built() and torch.version.hip is not None) _USE_CUDA = _get_build("USE_CUDA", torch.backends.cuda.is_built() and torch.version.hip is None) _BUILD_ALIGN = _get_build("BUILD_ALIGN", True) @@ -125,7 +124,6 @@ def build_extension(self, ext): f"-DUSE_CUDA:BOOL={'ON' if _USE_CUDA else 'OFF'}", f"-DUSE_OPENMP:BOOL={'ON' if _USE_OPENMP else 'OFF'}", f"-DUSE_FFMPEG:BOOL={'ON' if _USE_FFMPEG else 'OFF'}", - f"-DDLOPEN_FFMPEG:BOOL={'ON' if _DLOPEN_FFMPEG else 'OFF'}", ] build_args = ["--target", "install"] # Pass CUDA architecture to cmake diff --git a/torchaudio/csrc/ffmpeg/CMakeLists.txt b/torchaudio/csrc/ffmpeg/CMakeLists.txt index 849d83d62f..e3445265b5 100644 --- a/torchaudio/csrc/ffmpeg/CMakeLists.txt +++ b/torchaudio/csrc/ffmpeg/CMakeLists.txt @@ -2,13 +2,11 @@ message(STATUS "FFMPEG_ROOT=$ENV{FFMPEG_ROOT}") find_package(FFMPEG 4.1 REQUIRED COMPONENTS avdevice avfilter avformat avcodec avutil) add_library(ffmpeg INTERFACE) target_include_directories(ffmpeg INTERFACE "${FFMPEG_INCLUDE_DIRS}") -if (NOT DLOPEN_FFMPEG) target_link_libraries(ffmpeg INTERFACE "${FFMPEG_LIBRARIES}") -endif() + set( sources - stub.cpp ffmpeg.cpp filter_graph.cpp hw_context.cpp 
@@ -33,24 +31,24 @@ if (USE_CUDA) cuda_deps) endif() -if (DLOPEN_FFMPEG) - set(compile_definitions DLOPEN_FFMPEG) -endif() - torchaudio_library( libtorchaudio_ffmpeg "${sources}" "" "torch;ffmpeg;${additional_lib}" - "${compile_definitions}" + "" ) if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) + set( + ext_sources + pybind/pybind.cpp + ) torchaudio_extension( _torchaudio_ffmpeg - pybind/pybind.cpp + "${ext_sources}" "" "libtorchaudio_ffmpeg" - "${compile_definitions}" + "" ) endif () diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp index 55e6c142b9..7822b30392 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp +++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -8,12 +7,6 @@ namespace torchaudio::io { -std::string av_err2string(int errnum) { - char str[AV_ERROR_MAX_STRING_SIZE]; - FFMPEG av_strerror(errnum, str, AV_ERROR_MAX_STRING_SIZE); - return str; -} - //////////////////////////////////////////////////////////////////////////////// // AVDictionary //////////////////////////////////////////////////////////////////////////////// @@ -21,7 +14,7 @@ AVDictionary* get_option_dict(const c10::optional& option) { AVDictionary* opt = nullptr; if (option) { for (auto const& [key, value] : option.value()) { - FFMPEG av_dict_set(&opt, key.c_str(), value.c_str(), 0); + av_dict_set(&opt, key.c_str(), value.c_str(), 0); } } return opt; @@ -32,10 +25,10 @@ void clean_up_dict(AVDictionary* p) { std::vector unused_keys; // Check and copy unused keys, clean up the original dictionary AVDictionaryEntry* t = nullptr; - while ((t = FFMPEG av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { + while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { unused_keys.emplace_back(t->key); } - FFMPEG av_dict_free(&p); + av_dict_free(&p); TORCH_CHECK( unused_keys.empty(), "Unexpected options: ", @@ -47,14 +40,14 @@ void clean_up_dict(AVDictionary* p) { // AVFormatContext 
//////////////////////////////////////////////////////////////////////////////// void AVFormatInputContextDeleter::operator()(AVFormatContext* p) { - FFMPEG avformat_close_input(&p); + avformat_close_input(&p); }; AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p) : Wrapper(p) {} void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) { - FFMPEG avformat_free_context(p); + avformat_free_context(p); }; AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) @@ -64,9 +57,9 @@ AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) // AVIO //////////////////////////////////////////////////////////////////////////////// void AVIOContextDeleter::operator()(AVIOContext* p) { - FFMPEG avio_flush(p); - FFMPEG av_freep(&p->buffer); - FFMPEG av_freep(&p); + avio_flush(p); + av_freep(&p->buffer); + av_freep(&p); }; AVIOContextPtr::AVIOContextPtr(AVIOContext* p) @@ -76,13 +69,13 @@ AVIOContextPtr::AVIOContextPtr(AVIOContext* p) // AVPacket //////////////////////////////////////////////////////////////////////////////// void AVPacketDeleter::operator()(AVPacket* p) { - FFMPEG av_packet_free(&p); + av_packet_free(&p); }; AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper(p) {} AVPacketPtr alloc_avpacket() { - AVPacket* p = FFMPEG av_packet_alloc(); + AVPacket* p = av_packet_alloc(); TORCH_CHECK(p, "Failed to allocate AVPacket object."); return AVPacketPtr{p}; } @@ -92,7 +85,7 @@ AVPacketPtr alloc_avpacket() { //////////////////////////////////////////////////////////////////////////////// AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){}; AutoPacketUnref::~AutoPacketUnref() { - FFMPEG av_packet_unref(p_); + av_packet_unref(p_); } AutoPacketUnref::operator AVPacket*() const { return p_; @@ -102,13 +95,13 @@ AutoPacketUnref::operator AVPacket*() const { // AVFrame //////////////////////////////////////////////////////////////////////////////// void AVFrameDeleter::operator()(AVFrame* p) { - FFMPEG av_frame_free(&p); + 
av_frame_free(&p); }; AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper(p) {} AVFramePtr alloc_avframe() { - AVFrame* p = FFMPEG av_frame_alloc(); + AVFrame* p = av_frame_alloc(); TORCH_CHECK(p, "Failed to allocate AVFrame object."); return AVFramePtr{p}; }; @@ -117,7 +110,7 @@ AVFramePtr alloc_avframe() { // AVCodecContext //////////////////////////////////////////////////////////////////////////////// void AVCodecContextDeleter::operator()(AVCodecContext* p) { - FFMPEG avcodec_free_context(&p); + avcodec_free_context(&p); }; AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) @@ -127,7 +120,7 @@ AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) // AVBufferRefPtr //////////////////////////////////////////////////////////////////////////////// void AutoBufferUnref::operator()(AVBufferRef* p) { - FFMPEG av_buffer_unref(&p); + av_buffer_unref(&p); } AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) @@ -137,7 +130,7 @@ AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) // AVFilterGraph //////////////////////////////////////////////////////////////////////////////// void AVFilterGraphDeleter::operator()(AVFilterGraph* p) { - FFMPEG avfilter_graph_free(&p); + avfilter_graph_free(&p); }; AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) @@ -147,7 +140,7 @@ AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) // AVCodecParameters //////////////////////////////////////////////////////////////////////////////// void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) { - FFMPEG avcodec_parameters_free(&codecpar); + avcodec_parameters_free(&codecpar); } AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p) diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h index 83d18464fa..0bae00c12d 100644 --- a/torchaudio/csrc/ffmpeg/ffmpeg.h +++ b/torchaudio/csrc/ffmpeg/ffmpeg.h @@ -41,7 +41,10 @@ using OptionDict = std::map; // Replacement of av_err2str, which causes // `error: taking address of temporary array` // 
https://github.com/joncampbell123/composite-video-simulator/issues/5 -std::string av_err2string(int errnum); +av_always_inline std::string av_err2string(int errnum) { + char str[AV_ERROR_MAX_STRING_SIZE]; + return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); +} // Base structure that handles memory management. // Resource is freed by the destructor of unique_ptr, diff --git a/torchaudio/csrc/ffmpeg/filter_graph.cpp b/torchaudio/csrc/ffmpeg/filter_graph.cpp index faa3606e08..1a1e40b011 100644 --- a/torchaudio/csrc/ffmpeg/filter_graph.cpp +++ b/torchaudio/csrc/ffmpeg/filter_graph.cpp @@ -1,12 +1,11 @@ #include -#include #include namespace torchaudio::io { namespace { AVFilterGraph* get_filter_graph() { - AVFilterGraph* ptr = FFMPEG avfilter_graph_alloc(); + AVFilterGraph* ptr = avfilter_graph_alloc(); TORCH_CHECK(ptr, "Failed to allocate resouce."); ptr->nb_threads = 1; return ptr; @@ -32,7 +31,7 @@ std::string get_audio_src_args( time_base.num, time_base.den, sample_rate, - FFMPEG av_get_sample_fmt_name(format), + av_get_sample_fmt_name(format), channel_layout); return std::string(args); } @@ -51,7 +50,7 @@ std::string get_video_src_args( "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d", width, height, - FFMPEG av_get_pix_fmt_name(format), + av_get_pix_fmt_name(format), time_base.num, time_base.den, frame_rate.num, @@ -69,7 +68,7 @@ void FilterGraph::add_audio_src( int sample_rate, uint64_t channel_layout) { add_src( - FFMPEG avfilter_get_by_name("abuffer"), + avfilter_get_by_name("abuffer"), get_audio_src_args(format, time_base, sample_rate, channel_layout)); } @@ -81,13 +80,13 @@ void FilterGraph::add_video_src( int height, AVRational sample_aspect_ratio) { add_src( - FFMPEG avfilter_get_by_name("buffer"), + avfilter_get_by_name("buffer"), get_video_src_args( format, time_base, frame_rate, width, height, sample_aspect_ratio)); } void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { - int 
ret = FFMPEG avfilter_graph_create_filter( + int ret = avfilter_graph_create_filter( &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph); TORCH_CHECK( ret >= 0, @@ -96,11 +95,11 @@ void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { } void FilterGraph::add_audio_sink() { - add_sink(FFMPEG avfilter_get_by_name("abuffersink")); + add_sink(avfilter_get_by_name("abuffersink")); } void FilterGraph::add_video_sink() { - add_sink(FFMPEG avfilter_get_by_name("buffersink")); + add_sink(avfilter_get_by_name("buffersink")); } void FilterGraph::add_sink(const AVFilter* buffersink) { @@ -114,7 +113,7 @@ void FilterGraph::add_sink(const AVFilter* buffersink) { // According to the other example // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html // `abuffersink` should not take options, and this resolved issue. - int ret = FFMPEG avfilter_graph_create_filter( + int ret = avfilter_graph_create_filter( &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph); TORCH_CHECK(ret >= 0, "Failed to create output filter."); } @@ -131,15 +130,15 @@ class InOuts { public: InOuts(const char* name, AVFilterContext* pCtx) { - p = FFMPEG avfilter_inout_alloc(); + p = avfilter_inout_alloc(); TORCH_CHECK(p, "Failed to allocate AVFilterInOut."); - p->name = FFMPEG av_strdup(name); + p->name = av_strdup(name); p->filter_ctx = pCtx; p->pad_idx = 0; p->next = nullptr; } ~InOuts() { - FFMPEG avfilter_inout_free(&p); + avfilter_inout_free(&p); } operator AVFilterInOut**() { return &p; @@ -156,7 +155,7 @@ void FilterGraph::add_process(const std::string& filter_description) { // If you are debugging this part of the code, you might get confused. 
InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx}; - int ret = FFMPEG avfilter_graph_parse_ptr( + int ret = avfilter_graph_parse_ptr( graph, filter_description.c_str(), out, in, nullptr); TORCH_CHECK( @@ -167,11 +166,11 @@ void FilterGraph::add_process(const std::string& filter_description) { void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) { buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx; - int ret = FFMPEG avfilter_graph_config(graph, nullptr); + int ret = avfilter_graph_config(graph, nullptr); TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret)); - // char* desc = FFMPEG avfilter_graph_dump(graph, NULL); + // char* desc = avfilter_graph_dump(graph, NULL); // std::cerr << "Filter created:\n" << desc << std::endl; - // FFMPEG av_free(static_cast(desc)); + // av_free(static_cast(desc)); } ////////////////////////////////////////////////////////////////////////////// @@ -191,8 +190,7 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { ret.num_channels = l->ch_layout.nb_channels; #else // Before FFmpeg 5.1 - ret.num_channels = - FFMPEG av_get_channel_layout_nb_channels(l->channel_layout); + ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout); #endif break; } @@ -215,12 +213,12 @@ FilterGraphOutputInfo FilterGraph::get_output_info() const { // Streaming process ////////////////////////////////////////////////////////////////////////////// int FilterGraph::add_frame(AVFrame* pInputFrame) { - return FFMPEG av_buffersrc_add_frame_flags( + return av_buffersrc_add_frame_flags( buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF); } int FilterGraph::get_frame(AVFrame* pOutputFrame) { - return FFMPEG av_buffersink_get_frame(buffersink_ctx, pOutputFrame); + return av_buffersink_get_frame(buffersink_ctx, pOutputFrame); } } // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/hw_context.cpp b/torchaudio/csrc/ffmpeg/hw_context.cpp index 5c84f3dd09..a1d7f3c7a0 100644 --- 
a/torchaudio/csrc/ffmpeg/hw_context.cpp +++ b/torchaudio/csrc/ffmpeg/hw_context.cpp @@ -1,5 +1,4 @@ #include -#include namespace torchaudio::io { namespace { @@ -16,7 +15,7 @@ AVBufferRef* get_cuda_context(int index) { } if (CUDA_CONTEXT_CACHE.count(index) == 0) { AVBufferRef* p = nullptr; - int ret = FFMPEG av_hwdevice_ctx_create( + int ret = av_hwdevice_ctx_create( &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); TORCH_CHECK( ret >= 0, diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp index 5fcb9f6df8..95db01fcec 100644 --- a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp +++ b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp @@ -2,7 +2,6 @@ #include #include #include -#include namespace torchaudio::io { namespace { @@ -10,15 +9,15 @@ namespace { std::map> get_versions() { std::map> ret; -#define add_version(NAME) \ - { \ - int ver = FFMPEG NAME##_version(); \ - ret.emplace( \ - "lib" #NAME, \ - std::make_tuple<>( \ - AV_VERSION_MAJOR(ver), \ - AV_VERSION_MINOR(ver), \ - AV_VERSION_MICRO(ver))); \ +#define add_version(NAME) \ + { \ + int ver = NAME##_version(); \ + ret.emplace( \ + "lib" #NAME, \ + std::make_tuple<>( \ + AV_VERSION_MAJOR(ver), \ + AV_VERSION_MINOR(ver), \ + AV_VERSION_MICRO(ver))); \ } add_version(avutil); @@ -35,7 +34,7 @@ std::map get_demuxers(bool req_device) { std::map ret; const AVInputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = FFMPEG av_demuxer_iterate(&i))) { + while ((fmt = av_demuxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -52,7 +51,7 @@ std::map get_muxers(bool req_device) { std::map ret; const AVOutputFormat* fmt = nullptr; void* i = nullptr; - while ((fmt = FFMPEG av_muxer_iterate(&i))) { + while ((fmt = av_muxer_iterate(&i))) { assert(fmt); bool is_device = [&]() { const AVClass* avclass = fmt->priv_class; @@ -71,10 +70,10 @@ std::map get_codecs( const AVCodec* c = nullptr; void* i = nullptr; std::map ret; 
- while ((c = FFMPEG av_codec_iterate(&i))) { + while ((c = av_codec_iterate(&i))) { assert(c); - if ((req_encoder && FFMPEG av_codec_is_encoder(c)) || - (!req_encoder && FFMPEG av_codec_is_decoder(c))) { + if ((req_encoder && av_codec_is_encoder(c)) || + (!req_encoder && av_codec_is_decoder(c))) { if (c->type == type && c->name) { ret.emplace(c->name, c->long_name ? c->long_name : ""); } @@ -87,7 +86,7 @@ std::vector get_protocols(bool output) { void* opaque = nullptr; const char* name = nullptr; std::vector ret; - while ((name = FFMPEG avio_enum_protocols(&opaque, output))) { + while ((name = avio_enum_protocols(&opaque, output))) { assert(name); ret.emplace_back(name); } @@ -95,7 +94,7 @@ std::vector get_protocols(bool output) { } std::string get_build_config() { - return FFMPEG avcodec_configuration(); + return avcodec_configuration(); } ////////////////////////////////////////////////////////////////////////////// @@ -188,9 +187,9 @@ struct StreamWriterFileObj : private FileObj, public StreamWriterCustomIO { }; PYBIND11_MODULE(_torchaudio_ffmpeg, m) { - m.def("init", []() { FFMPEG avdevice_register_all(); }); - m.def("get_log_level", []() { return FFMPEG av_log_get_level(); }); - m.def("set_log_level", [](int level) { FFMPEG av_log_set_level(level); }); + m.def("init", []() { avdevice_register_all(); }); + m.def("get_log_level", []() { return av_log_get_level(); }); + m.def("set_log_level", [](int level) { av_log_set_level(level); }); m.def("get_versions", &get_versions); m.def("get_muxers", []() { return get_muxers(false); }); m.def("get_demuxers", []() { return get_demuxers(false); }); @@ -246,22 +245,21 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const OutputStreamInfo& o) -> std::string { - return FFMPEG av_get_media_type_string(o.media_type); + return av_get_media_type_string(o.media_type); }) .def_property_readonly( "format", [](const OutputStreamInfo& o) -> std::string { switch (o.media_type) { case 
AVMEDIA_TYPE_AUDIO: - return FFMPEG av_get_sample_fmt_name( - (AVSampleFormat)(o.format)); + return av_get_sample_fmt_name((AVSampleFormat)(o.format)); case AVMEDIA_TYPE_VIDEO: - return FFMPEG av_get_pix_fmt_name((AVPixelFormat)(o.format)); + return av_get_pix_fmt_name((AVPixelFormat)(o.format)); default: TORCH_INTERNAL_ASSERT( false, "FilterGraph is returning unexpected media type: ", - FFMPEG av_get_media_type_string(o.media_type)); + av_get_media_type_string(o.media_type)); } }) .def_readonly("sample_rate", &OutputStreamInfo::sample_rate) @@ -285,7 +283,7 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) { .def_property_readonly( "media_type", [](const SrcStreamInfo& s) { - return FFMPEG av_get_media_type_string(s.media_type); + return av_get_media_type_string(s.media_type); }) .def_readonly("codec_name", &SrcStreamInfo::codec_name) .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name) diff --git a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp index cf126d16a2..406f4e91bf 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/conversion.cpp @@ -1,6 +1,5 @@ #include #include -#include #ifdef USE_CUDA #include @@ -429,11 +428,11 @@ void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_NV12 == sw_fmt, "Expected NV12 format. Found: ", - FFMPEG av_get_pix_fmt_name(sw_fmt)); + av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -510,11 +509,11 @@ void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. 
Found: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_P010 == sw_fmt, "Expected P010 format. Found: ", - FFMPEG av_get_pix_fmt_name(sw_fmt)); + av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( @@ -591,11 +590,11 @@ void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_YUV444P == sw_fmt, "Expected YUV444P format. Found: ", - FFMPEG av_get_pix_fmt_name(sw_fmt)); + av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly for (int i = 0; i < 3; ++i) { diff --git a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp index 883999fa41..bcff81dc3b 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/packet_buffer.cpp @@ -1,11 +1,9 @@ #include -#include namespace torchaudio::io { - void PacketBuffer::push_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); - AVPacket* p = FFMPEG av_packet_clone(packet); + AVPacket* p = av_packet_clone(packet); TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); packets.emplace_back(p); } diff --git a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp index 4f397d8b49..38440e3e33 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/post_process.cpp @@ -2,7 +2,6 @@ #include #include #include -#include namespace torchaudio::io { namespace detail { @@ -49,7 +48,7 @@ FilterGraphFactory get_video_factory( f.add_video_sink(); f.add_process(filter_desc); if (hw_frames_ctx) { - f.create_filter(FFMPEG av_buffer_ref(hw_frames_ctx)); + f.create_filter(av_buffer_ref(hw_frames_ctx)); } else { 
f.create_filter(); } @@ -140,7 +139,7 @@ struct ProcessImpl : public IPostDecodeProcess { if (ret >= 0) { buffer.push_frame(converter.convert(frame), frame->pts); } - FFMPEG av_frame_unref(frame); + av_frame_unref(frame); } return ret; } @@ -160,7 +159,7 @@ std::unique_ptr get_unchunked_audio_process( TORCH_INTERNAL_ASSERT( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); using B = UnchunkedBuffer; @@ -227,7 +226,7 @@ std::unique_ptr get_unchunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); } } @@ -240,7 +239,7 @@ std::unique_ptr get_chunked_audio_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_AUDIO, "Unsupported media type found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); using B = ChunkedBuffer; B buffer{i.time_base, frames_per_chunk, num_chunks}; @@ -308,7 +307,7 @@ std::unique_ptr get_chunked_audio_process( } default: TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", FFMPEG av_get_sample_fmt_name(fmt)); + false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); } } @@ -322,7 +321,7 @@ std::unique_ptr get_unchunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -376,9 +375,7 @@ std::unique_ptr get_unchunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, - "Unexpected video format found: ", - FFMPEG av_get_pix_fmt_name(fmt)); + false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); } } } @@ -396,7 +393,7 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type 
found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); using B = UnchunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -419,13 +416,13 @@ std::unique_ptr get_unchunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); } } #endif @@ -440,7 +437,7 @@ std::unique_ptr get_chunked_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); auto h = i.height; auto w = i.width; @@ -494,9 +491,7 @@ std::unique_ptr get_chunked_video_process( } default: { TORCH_INTERNAL_ASSERT( - false, - "Unexpected video format found: ", - FFMPEG av_get_pix_fmt_name(fmt)); + false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); } } } @@ -516,7 +511,7 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_INTERNAL_ASSERT_DEBUG_ONLY( i.type == AVMEDIA_TYPE_VIDEO, "Unsupported media type found: ", - FFMPEG av_get_media_type_string(i.type)); + av_get_media_type_string(i.type)); using B = ChunkedBuffer; switch (auto fmt = (AVPixelFormat)i.format; fmt) { @@ -545,13 +540,13 @@ std::unique_ptr get_chunked_cuda_video_process( TORCH_CHECK( false, "Unsupported video format found in CUDA HW: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); } default: { TORCH_CHECK( false, "Unexpected video format found in CUDA HW: ", - FFMPEG av_get_pix_fmt_name(fmt)); + av_get_pix_fmt_name(fmt)); } } #endif diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp index ffd1ddea38..2213a4018a 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp +++ 
b/torchaudio/csrc/ffmpeg/stream_reader/stream_processor.cpp @@ -1,10 +1,10 @@ #include #include -#include #include #include namespace torchaudio::io { + namespace { AVCodecContextPtr alloc_codec_context( enum AVCodecID codec_id, @@ -12,24 +12,24 @@ AVCodecContextPtr alloc_codec_context( const AVCodec* codec = [&]() { if (decoder_name) { const AVCodec* c = - FFMPEG avcodec_find_decoder_by_name(decoder_name.value().c_str()); + avcodec_find_decoder_by_name(decoder_name.value().c_str()); TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value()); return c; } else { - const AVCodec* c = FFMPEG avcodec_find_decoder(codec_id); - TORCH_CHECK(c, "Unsupported codec: ", FFMPEG avcodec_get_name(codec_id)); + const AVCodec* c = avcodec_find_decoder(codec_id); + TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id)); return c; } }(); - AVCodecContext* codec_ctx = FFMPEG avcodec_alloc_context3(codec); + AVCodecContext* codec_ctx = avcodec_alloc_context3(codec); TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext."); return AVCodecContextPtr(codec_ctx); } const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) { for (int i = 0;; ++i) { - const AVCodecHWConfig* config = FFMPEG avcodec_get_hw_config(codec, i); + const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i); if (!config) { break; } @@ -82,7 +82,7 @@ enum AVPixelFormat get_hw_format( } AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { - AVBufferRef* p = FFMPEG av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); + AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); TORCH_CHECK( p, "Failed to allocate CUDA frame context from device context at ", @@ -93,11 +93,11 @@ AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { frames_ctx->width = codec_ctx->width; frames_ctx->height = codec_ctx->height; frames_ctx->initial_pool_size = 5; - int ret = FFMPEG av_hwframe_ctx_init(p); + int ret = av_hwframe_ctx_init(p); if (ret >= 0) { return p; } - FFMPEG av_buffer_unref(&p); + 
av_buffer_unref(&p); TORCH_CHECK( false, "Failed to initialize CUDA frame context: ", av_err2string(ret)); } @@ -106,7 +106,7 @@ void configure_codec_context( AVCodecContext* codec_ctx, const AVCodecParameters* params, const torch::Device& device) { - int ret = FFMPEG avcodec_parameters_to_context(codec_ctx, params); + int ret = avcodec_parameters_to_context(codec_ctx, params); TORCH_CHECK( ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret)); @@ -121,8 +121,7 @@ void configure_codec_context( // 2. Set pCodecContext->get_format call back function which // will retrieve the HW pixel format from opaque pointer. codec_ctx->get_format = get_hw_format; - codec_ctx->hw_device_ctx = - FFMPEG av_buffer_ref(get_cuda_context(device.index())); + codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( codec_ctx->hw_device_ctx, "Failed to reference HW device context."); #endif @@ -135,16 +134,16 @@ void open_codec( AVDictionary* opts = get_option_dict(decoder_option); // Default to single thread execution. 
- if (!FFMPEG av_dict_get(opts, "threads", nullptr, 0)) { - FFMPEG av_dict_set(&opts, "threads", "1", 0); + if (!av_dict_get(opts, "threads", nullptr, 0)) { + av_dict_set(&opts, "threads", "1", 0); } if (!codec_ctx->channel_layout) { codec_ctx->channel_layout = - FFMPEG av_get_default_channel_layout(codec_ctx->channels); + av_get_default_channel_layout(codec_ctx->channels); } - int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opts); + int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts); clean_up_dict(opts); TORCH_CHECK( ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret)); @@ -259,8 +258,8 @@ void StreamProcessor::remove_stream(KeyType key) { void StreamProcessor::set_discard_timestamp(int64_t timestamp) { TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative."); - discard_before_pts = FFMPEG av_rescale_q( - timestamp, FFMPEG av_get_time_base_q(), stream_time_base); + discard_before_pts = + av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base); } void StreamProcessor::set_decoder( @@ -306,9 +305,9 @@ int StreamProcessor::process_packet(AVPacket* packet) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - int ret = FFMPEG avcodec_send_packet(codec_ctx, packet); + int ret = avcodec_send_packet(codec_ctx, packet); while (ret >= 0) { - ret = FFMPEG avcodec_receive_frame(codec_ctx, frame); + ret = avcodec_receive_frame(codec_ctx, frame); // AVERROR(EAGAIN) means that new input data is required to return new // output. 
if (ret == AVERROR(EAGAIN)) @@ -355,7 +354,7 @@ int StreamProcessor::process_packet(AVPacket* packet) { } // else we can just unref the frame and continue - FFMPEG av_frame_unref(frame); + av_frame_unref(frame); } return ret; } @@ -364,7 +363,7 @@ void StreamProcessor::flush() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( is_decoder_set(), "Decoder must have been set prior to calling this function."); - FFMPEG avcodec_flush_buffers(codec_ctx); + avcodec_flush_buffers(codec_ctx); for (auto& ite : post_processes) { ite.second->flush(); } diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp index 518bc02131..b8e9d7a9bf 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.cpp @@ -1,15 +1,10 @@ #include #include -#include #include #include #include #include -extern "C" { -#include -} - namespace torchaudio::io { using KeyType = StreamProcessor::KeyType; @@ -23,7 +18,7 @@ AVFormatContext* get_input_format_context( const c10::optional& format, const c10::optional& option, AVIOContext* io_ctx) { - AVFormatContext* p = FFMPEG avformat_alloc_context(); + AVFormatContext* p = avformat_alloc_context(); TORCH_CHECK(p, "Failed to allocate AVFormatContext."); if (io_ctx) { p->pb = io_ctx; @@ -33,7 +28,7 @@ AVFormatContext* get_input_format_context( if (format.has_value()) { std::string format_str = format.value(); AVFORMAT_CONST AVInputFormat* pInput = - FFMPEG av_find_input_format(format_str.c_str()); + av_find_input_format(format_str.c_str()); TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\""); return pInput; } @@ -41,7 +36,7 @@ AVFormatContext* get_input_format_context( }(); AVDictionary* opt = get_option_dict(option); - int ret = FFMPEG avformat_open_input(&p, src.c_str(), pInputFormat, &opt); + int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt); clean_up_dict(opt); TORCH_CHECK( @@ -57,7 +52,7 @@ 
AVFormatContext* get_input_format_context( StreamReader::StreamReader(AVFormatContext* p) : format_ctx(p) { C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamReader"); - int ret = FFMPEG avformat_find_stream_info(format_ctx, nullptr); + int ret = avformat_find_stream_info(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to find stream information: ", av_err2string(ret)); @@ -114,7 +109,7 @@ void validate_src_stream_type( "Stream ", i, " is not ", - FFMPEG av_get_media_type_string(type), + av_get_media_type_string(type), " stream."); } @@ -129,7 +124,7 @@ namespace { OptionDict parse_metadata(const AVDictionary* metadata) { AVDictionaryEntry* tag = nullptr; OptionDict ret; - while ((tag = FFMPEG av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { + while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { ret.emplace(std::string(tag->key), std::string(tag->value)); } return ret; @@ -152,8 +147,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { ret.num_frames = stream->nb_frames; ret.bits_per_sample = codecpar->bits_per_raw_sample; ret.metadata = parse_metadata(stream->metadata); - const AVCodecDescriptor* desc = - FFMPEG avcodec_descriptor_get(codecpar->codec_id); + const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); if (desc) { ret.codec_name = desc->name; ret.codec_long_name = desc->long_name; @@ -163,7 +157,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_AUDIO: { AVSampleFormat smp_fmt = static_cast(codecpar->format); if (smp_fmt != AV_SAMPLE_FMT_NONE) { - ret.fmt_name = FFMPEG av_get_sample_fmt_name(smp_fmt); + ret.fmt_name = av_get_sample_fmt_name(smp_fmt); } ret.sample_rate = static_cast(codecpar->sample_rate); ret.num_channels = codecpar->channels; @@ -172,7 +166,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { case AVMEDIA_TYPE_VIDEO: { AVPixelFormat pix_fmt = static_cast(codecpar->format); if (pix_fmt != AV_PIX_FMT_NONE) { - ret.fmt_name = 
FFMPEG av_get_pix_fmt_name(pix_fmt); + ret.fmt_name = av_get_pix_fmt_name(pix_fmt); } ret.width = codecpar->width; ret.height = codecpar->height; @@ -186,7 +180,7 @@ SrcStreamInfo StreamReader::get_src_stream_info(int i) const { namespace { AVCodecParameters* get_codecpar() { - AVCodecParameters* ptr = FFMPEG avcodec_parameters_alloc(); + AVCodecParameters* ptr = avcodec_parameters_alloc(); TORCH_CHECK(ptr, "Failed to allocate resource."); return ptr; } @@ -197,7 +191,7 @@ StreamParams StreamReader::get_src_stream_params(int i) { AVStream* stream = format_ctx->streams[i]; AVCodecParametersPtr codec_params(get_codecpar()); - int ret = FFMPEG avcodec_parameters_copy(codec_params, stream->codecpar); + int ret = avcodec_parameters_copy(codec_params, stream->codecpar); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. (", @@ -239,12 +233,12 @@ OutputStreamInfo StreamReader::get_out_stream_info(int i) const { } int64_t StreamReader::find_best_audio_stream() const { - return FFMPEG av_find_best_stream( + return av_find_best_stream( format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0); } int64_t StreamReader::find_best_video_stream() const { - return FFMPEG av_find_best_stream( + return av_find_best_stream( format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); } @@ -294,7 +288,7 @@ void StreamReader::seek(double timestamp_s, int64_t mode) { TORCH_CHECK(false, "Invalid mode value: ", mode); } - int ret = FFMPEG av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); + int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); if (ret < 0) { seek_timestamp = 0; @@ -407,12 +401,12 @@ void StreamReader::add_stream( case AVMEDIA_TYPE_AUDIO: return AVRational{0, 1}; case AVMEDIA_TYPE_VIDEO: - return FFMPEG av_guess_frame_rate(format_ctx, stream, nullptr); + return av_guess_frame_rate(format_ctx, stream, nullptr); default: TORCH_INTERNAL_ASSERT( false, "Unexpected media type is given: ", - FFMPEG av_get_media_type_string(media_type)); + 
av_get_media_type_string(media_type)); } }(); int key = processors[i]->add_stream( @@ -451,7 +445,7 @@ void StreamReader::remove_stream(int64_t i) { // 1: It's done, caller should stop calling // <0: Some error happened int StreamReader::process_packet() { - int ret = FFMPEG av_read_frame(format_ctx, packet); + int ret = av_read_frame(format_ctx, packet); if (ret == AVERROR_EOF) { ret = drain(); return (ret < 0) ? ret : 1; @@ -582,13 +576,12 @@ AVIOContext* get_io_context( int buffer_size, int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = - static_cast(FFMPEG av_malloc(buffer_size)); + unsigned char* buffer = static_cast(av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = FFMPEG avio_alloc_context( + AVIOContext* io_ctx = avio_alloc_context( buffer, buffer_size, 0, opaque, read_packet, nullptr, seek); if (!io_ctx) { - FFMPEG av_freep(&buffer); + av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp index 3f9a153004..c13c3cfcb9 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encode_process.cpp @@ -1,12 +1,7 @@ #include #include -#include #include -extern "C" { -#include -} - namespace torchaudio::io { //////////////////////////////////////////////////////////////////////////////// @@ -61,7 +56,7 @@ void EncodeProcess::process_frame(AVFrame* src) { if (ret >= 0) { encoder.encode(dst_frame); } - FFMPEG av_frame_unref(dst_frame); + av_frame_unref(dst_frame); } } @@ -76,8 +71,8 @@ void EncodeProcess::flush() { namespace { enum AVSampleFormat get_src_sample_fmt(const std::string& src) { - auto fmt = FFMPEG av_get_sample_fmt(src.c_str()); - if (fmt != AV_SAMPLE_FMT_NONE && !FFMPEG 
av_sample_fmt_is_planar(fmt)) { + auto fmt = av_get_sample_fmt(src.c_str()); + if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) { return fmt; } TORCH_CHECK( @@ -94,7 +89,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) { AV_SAMPLE_FMT_S64, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_DBL}) { - ret.emplace_back(FFMPEG av_get_sample_fmt_name(fmt)); + ret.emplace_back(av_get_sample_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -102,7 +97,7 @@ enum AVSampleFormat get_src_sample_fmt(const std::string& src) { } enum AVPixelFormat get_src_pix_fmt(const std::string& src) { - AVPixelFormat fmt = FFMPEG av_get_pix_fmt(src.c_str()); + AVPixelFormat fmt = av_get_pix_fmt(src.c_str()); switch (fmt) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: @@ -123,7 +118,7 @@ enum AVPixelFormat get_src_pix_fmt(const std::string& src) { AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, AV_PIX_FMT_YUV444P}) { - ret.emplace_back(FFMPEG av_get_pix_fmt_name(fmt)); + ret.emplace_back(av_get_pix_fmt_name(fmt)); } return c10::Join(", ", ret); }(), @@ -137,21 +132,18 @@ const AVCodec* get_codec( AVCodecID default_codec, const c10::optional& encoder) { if (encoder) { - const AVCodec* c = - FFMPEG avcodec_find_encoder_by_name(encoder.value().c_str()); + const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str()); TORCH_CHECK(c, "Unexpected codec: ", encoder.value()); return c; } - const AVCodec* c = FFMPEG avcodec_find_encoder(default_codec); + const AVCodec* c = avcodec_find_encoder(default_codec); TORCH_CHECK( - c, - "Encoder not found for codec: ", - FFMPEG avcodec_get_name(default_codec)); + c, "Encoder not found for codec: ", avcodec_get_name(default_codec)); return c; } AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) { - AVCodecContext* ctx = FFMPEG avcodec_alloc_context3(codec); + AVCodecContext* ctx = avcodec_alloc_context3(codec); TORCH_CHECK(ctx, "Failed to allocate CodecContext."); if (flags & AVFMT_GLOBALHEADER) { @@ -177,25 +169,25 @@ void 
open_codec( // while "libopus" refers to the one depends on libopusenc // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251 if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) { - if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) { + if (!av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - FFMPEG av_dict_set(&opt, "strict", "experimental", 0); + av_dict_set(&opt, "strict", "experimental", 0); } } if (std::strcmp(codec_ctx->codec->name, "opus") == 0) { - if (!FFMPEG av_dict_get(opt, "strict", nullptr, 0)) { + if (!av_dict_get(opt, "strict", nullptr, 0)) { TORCH_WARN_ONCE( "\"opus\" encoder is selected. Enabling '-strict experimental'. ", "If this is not desired, please provide \"strict\" encoder option ", "with desired value."); - FFMPEG av_dict_set(&opt, "strict", "experimental", 0); + av_dict_set(&opt, "strict", "experimental", 0); } } - int ret = FFMPEG avcodec_open2(codec_ctx, codec_ctx->codec, &opt); + int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt); clean_up_dict(opt); TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")"); } @@ -222,7 +214,7 @@ bool supported_sample_fmt( std::string get_supported_formats(const AVSampleFormat* sample_fmts) { std::vector ret; while (*sample_fmts != AV_SAMPLE_FMT_NONE) { - ret.emplace_back(FFMPEG av_get_sample_fmt_name(*sample_fmts)); + ret.emplace_back(av_get_sample_fmt_name(*sample_fmts)); ++sample_fmts; } return c10::Join(", ", ret); @@ -234,7 +226,7 @@ AVSampleFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { auto& enc_fmt_val = encoder_format.value(); - auto fmt = FFMPEG av_get_sample_fmt(enc_fmt_val.c_str()); + auto fmt = av_get_sample_fmt(enc_fmt_val.c_str()); TORCH_CHECK( fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val); TORCH_CHECK( @@ -321,8 +313,8 
@@ std::string get_supported_channels(const uint64_t* channel_layouts) { std::vector names; while (*channel_layouts) { std::stringstream ss; - ss << FFMPEG av_get_channel_layout_nb_channels(*channel_layouts); - ss << " (" << FFMPEG av_get_channel_name(*channel_layouts) << ")"; + ss << av_get_channel_layout_nb_channels(*channel_layouts); + ss << " (" << av_get_channel_name(*channel_layouts) << ")"; names.emplace_back(ss.str()); ++channel_layouts; } @@ -339,10 +331,10 @@ uint64_t get_channel_layout( TORCH_CHECK( val > 0, "The number of channels must be greater than 0. Found: ", val); if (!codec->channel_layouts) { - return static_cast(FFMPEG av_get_default_channel_layout(val)); + return static_cast(av_get_default_channel_layout(val)); } for (const uint64_t* it = codec->channel_layouts; *it; ++it) { - if (FFMPEG av_get_channel_layout_nb_channels(*it) == val) { + if (av_get_channel_layout_nb_channels(*it) == val) { return *it; } } @@ -379,9 +371,8 @@ void configure_audio_codec_ctx( const c10::optional& codec_config) { codec_ctx->sample_fmt = format; codec_ctx->sample_rate = sample_rate; - codec_ctx->time_base = av_inv_q(FFMPEG av_d2q(sample_rate, 1 << 24)); - codec_ctx->channels = - FFMPEG av_get_channel_layout_nb_channels(channel_layout); + codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24)); + codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout); codec_ctx->channel_layout = channel_layout; // Set optional stuff @@ -420,7 +411,7 @@ bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) { std::string get_supported_formats(const AVPixelFormat* pix_fmts) { std::vector ret; while (*pix_fmts != AV_PIX_FMT_NONE) { - ret.emplace_back(FFMPEG av_get_pix_fmt_name(*pix_fmts)); + ret.emplace_back(av_get_pix_fmt_name(*pix_fmts)); ++pix_fmts; } return c10::Join(", ", ret); @@ -432,7 +423,7 @@ AVPixelFormat get_enc_fmt( const AVCodec* codec) { if (encoder_format) { const auto& val = encoder_format.value(); - auto fmt = FFMPEG 
av_get_pix_fmt(val.c_str()); + auto fmt = av_get_pix_fmt(val.c_str()); TORCH_CHECK( supported_pix_fmt(fmt, codec->pix_fmts), codec->name, @@ -470,7 +461,7 @@ AVRational get_enc_rate( std::isfinite(enc_rate) && enc_rate > 0, "Encoder sample rate must be positive and fininte. Found: ", enc_rate); - AVRational rate = FFMPEG av_d2q(enc_rate, 1 << 24); + AVRational rate = av_d2q(enc_rate, 1 << 24); TORCH_CHECK( supported_frame_rate(rate, codec->supported_framerates), codec->name, @@ -554,14 +545,14 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { // context to AVCodecContext. But this way, it will be deallocated // automatically at the time AVCodecContext is freed, so we do that. - ctx->hw_device_ctx = FFMPEG av_buffer_ref(get_cuda_context(device.index())); + ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); TORCH_INTERNAL_ASSERT( ctx->hw_device_ctx, "Failed to reference HW device context."); ctx->sw_pix_fmt = ctx->pix_fmt; ctx->pix_fmt = AV_PIX_FMT_CUDA; - ctx->hw_frames_ctx = FFMPEG av_hwframe_ctx_alloc(ctx->hw_device_ctx); + ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx); TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context."); auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data); @@ -571,7 +562,7 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { frames_ctx->height = ctx->height; frames_ctx->initial_pool_size = 5; - int ret = FFMPEG av_hwframe_ctx_init(ctx->hw_frames_ctx); + int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx); TORCH_CHECK( ret >= 0, "Failed to initialize CUDA frame context: ", @@ -583,11 +574,11 @@ void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { //////////////////////////////////////////////////////////////////////////////// AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) { - AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr); + AVStream* stream = 
avformat_new_stream(format_ctx, nullptr); TORCH_CHECK(stream, "Failed to allocate stream."); stream->time_base = codec_ctx->time_base; - int ret = FFMPEG avcodec_parameters_from_context(stream->codecpar, codec_ctx); + int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx); TORCH_CHECK( ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret)); return stream; @@ -614,7 +605,7 @@ FilterGraph get_audio_filter_graph( if (filter_desc || src_fmt != enc_fmt || src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) { std::stringstream ss; - ss << "aformat=sample_fmts=" << FFMPEG av_get_sample_fmt_name(enc_fmt) + ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt) << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x" << std::hex << enc_ch_layout; parts.push_back(ss.str()); @@ -665,7 +656,7 @@ FilterGraph get_video_filter_graph( } if (filter_desc || src_fmt != enc_fmt) { std::stringstream ss; - ss << "format=" << FFMPEG av_get_pix_fmt_name(enc_fmt); + ss << "format=" << av_get_pix_fmt_name(enc_fmt); parts.emplace_back(ss.str()); } if (filter_desc || @@ -709,7 +700,7 @@ AVFramePtr get_audio_frame( frame->channel_layout = channel_layout; frame->sample_rate = sample_rate; frame->nb_samples = nb_samples; - int ret = FFMPEG av_frame_get_buffer(frame, 0); + int ret = av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating the source audio frame:", av_err2string(ret)); @@ -725,7 +716,7 @@ AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) { frame->format = src_fmt; frame->width = width; frame->height = height; - int ret = FFMPEG av_frame_get_buffer(frame, 0); + int ret = av_frame_get_buffer(frame, 0); TORCH_CHECK( ret >= 0, "Error allocating a video buffer :", av_err2string(ret)); @@ -770,10 +761,10 @@ EncodeProcess get_audio_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. 
const AVSampleFormat src_fmt = (disable_converter) - ? FFMPEG av_get_sample_fmt(format.c_str()) + ? av_get_sample_fmt(format.c_str()) : get_src_sample_fmt(format); - const auto src_ch_layout = static_cast( - FFMPEG av_get_default_channel_layout(src_num_channels)); + const auto src_ch_layout = + static_cast(av_get_default_channel_layout(src_num_channels)); // 2. Fetch codec from default or override TORCH_CHECK( @@ -793,7 +784,7 @@ EncodeProcess get_audio_encode_process( // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277 // This is the case for at least until FFmpeg 6.0, so it will be // like this for a while. - return static_cast(FFMPEG av_get_default_channel_layout(2)); + return static_cast(av_get_default_channel_layout(2)); } return get_channel_layout(src_ch_layout, encoder_num_channels, codec); }(); @@ -881,9 +872,9 @@ EncodeProcess get_video_encode_process( // case, restrictions on the format to support tensor inputs do not apply, and // so we directly get the format via FFmpeg. const AVPixelFormat src_fmt = (disable_converter) - ? FFMPEG av_get_pix_fmt(format.c_str()) + ? av_get_pix_fmt(format.c_str()) : get_src_pix_fmt(format); - const AVRational src_rate = FFMPEG av_d2q(frame_rate, 1 << 24); + const AVRational src_rate = av_d2q(frame_rate, 1 << 24); // 2. 
Fetch codec from default or override TORCH_CHECK( @@ -950,8 +941,7 @@ EncodeProcess get_video_encode_process( AVFramePtr src_frame = [&]() { if (codec_ctx->hw_frames_ctx) { AVFramePtr frame{alloc_avframe()}; - int ret = - FFMPEG av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); + int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret)); frame->nb_samples = 1; frame->pts = 0; diff --git a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp index 7552484f2a..3d2e501535 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/encoder.cpp @@ -1,5 +1,4 @@ #include -#include namespace torchaudio::io { @@ -14,10 +13,10 @@ Encoder::Encoder( /// /// @param frame Frame data to encode void Encoder::encode(AVFrame* frame) { - int ret = FFMPEG avcodec_send_frame(codec_ctx, frame); + int ret = avcodec_send_frame(codec_ctx, frame); TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ")."); while (ret >= 0) { - ret = FFMPEG avcodec_receive_packet(codec_ctx, packet); + ret = avcodec_receive_packet(codec_ctx, packet); if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { if (ret == AVERROR_EOF) { // Note: @@ -32,7 +31,7 @@ void Encoder::encode(AVFrame* frame) { // An alternative is to use `av_write_frame` functoin, but in that case // client code is responsible for ordering packets, which makes it // complicated to use StreamWriter - ret = FFMPEG av_interleaved_write_frame(format_ctx, nullptr); + ret = av_interleaved_write_frame(format_ctx, nullptr); TORCH_CHECK( ret >= 0, "Failed to flush packet (", av_err2string(ret), ")."); } @@ -52,11 +51,10 @@ void Encoder::encode(AVFrame* frame) { // This has to be set before av_packet_rescale_ts bellow. 
packet->duration = 1; } - FFMPEG av_packet_rescale_ts( - packet, codec_ctx->time_base, stream->time_base); + av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base); packet->stream_index = stream->index; - ret = FFMPEG av_interleaved_write_frame(format_ctx, packet); + ret = av_interleaved_write_frame(format_ctx, packet); TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ")."); } } diff --git a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp index 45872a6af5..0701c5a596 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/packet_writer.cpp @@ -1,14 +1,13 @@ #include -#include namespace torchaudio::io { namespace { AVStream* add_stream( AVFormatContext* format_ctx, const StreamParams& stream_params) { - AVStream* stream = FFMPEG avformat_new_stream(format_ctx, nullptr); - int ret = FFMPEG avcodec_parameters_copy( - stream->codecpar, stream_params.codec_params); + AVStream* stream = avformat_new_stream(format_ctx, nullptr); + int ret = + avcodec_parameters_copy(stream->codecpar, stream_params.codec_params); TORCH_CHECK( ret >= 0, "Failed to copy the stream's codec parameters. 
(", @@ -27,12 +26,11 @@ PacketWriter::PacketWriter( void PacketWriter::write_packet(const AVPacketPtr& packet) { AVPacket dst_packet; - int ret = FFMPEG av_packet_ref(&dst_packet, packet); + int ret = av_packet_ref(&dst_packet, packet); TORCH_CHECK(ret >= 0, "Failed to copy packet."); - FFMPEG av_packet_rescale_ts( - &dst_packet, original_time_base, stream->time_base); + av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); dst_packet.stream_index = stream->index; - ret = FFMPEG av_interleaved_write_frame(format_ctx, &dst_packet); + ret = av_interleaved_write_frame(format_ctx, &dst_packet); TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); } } // namespace torchaudio::io diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp index 4252cd7072..df51d92355 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp @@ -1,11 +1,11 @@ #include -#include #ifdef USE_CUDA #include #endif -namespace torchaudio::io { +namespace torchaudio { +namespace io { namespace { AVFormatContext* get_output_format_context( @@ -19,7 +19,7 @@ AVFormatContext* get_output_format_context( } AVFormatContext* p = nullptr; - int ret = FFMPEG avformat_alloc_output_context2( + int ret = avformat_alloc_output_context2( &p, nullptr, format ? 
format.value().c_str() : nullptr, dst.c_str()); TORCH_CHECK( ret >= 0, @@ -208,14 +208,14 @@ void StreamWriter::add_video_frame_stream( } void StreamWriter::set_metadata(const OptionDict& metadata) { - FFMPEG av_dict_free(&format_ctx->metadata); + av_dict_free(&format_ctx->metadata); for (auto const& [key, value] : metadata) { - FFMPEG av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); + av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); } } void StreamWriter::dump_format(int64_t i) { - FFMPEG av_dump_format(format_ctx, (int)i, format_ctx->url, 1); + av_dump_format(format_ctx, (int)i, format_ctx->url, 1); } void StreamWriter::open(const c10::optional& option) { @@ -231,10 +231,10 @@ void StreamWriter::open(const c10::optional& option) { AVDictionary* opt = get_option_dict(option); if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { - ret = FFMPEG avio_open2( + ret = avio_open2( &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt); if (ret < 0) { - FFMPEG av_dict_free(&opt); + av_dict_free(&opt); TORCH_CHECK( false, "Failed to open dst: ", @@ -245,7 +245,7 @@ void StreamWriter::open(const c10::optional& option) { } } - ret = FFMPEG avformat_write_header(format_ctx, &opt); + ret = avformat_write_header(format_ctx, &opt); clean_up_dict(opt); TORCH_CHECK( ret >= 0, @@ -258,7 +258,7 @@ void StreamWriter::open(const c10::optional& option) { } void StreamWriter::close() { - int ret = FFMPEG av_write_trailer(format_ctx); + int ret = av_write_trailer(format_ctx); if (ret < 0) { LOG(WARNING) << "Failed to write trailer. 
(" << av_err2string(ret) << ")."; } @@ -269,7 +269,7 @@ void StreamWriter::close() { if (!(fmt->flags & AVFMT_NOFILE) && !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { // avio_closep can be only applied to AVIOContext opened by avio_open - FFMPEG avio_closep(&(format_ctx->pb)); + avio_closep(&(format_ctx->pb)); } is_open = false; } @@ -355,13 +355,12 @@ AVIOContext* get_io_context( int buffer_size, int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = - static_cast(FFMPEG av_malloc(buffer_size)); + unsigned char* buffer = static_cast(av_malloc(buffer_size)); TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = FFMPEG avio_alloc_context( + AVIOContext* io_ctx = avio_alloc_context( buffer, buffer_size, 1, opaque, nullptr, write_packet, seek); if (!io_ctx) { - FFMPEG av_freep(&buffer); + av_freep(&buffer); TORCH_CHECK(false, "Failed to allocate AVIOContext."); } return io_ctx; @@ -385,4 +384,5 @@ StreamWriterCustomIO::StreamWriterCustomIO( : CustomOutput(opaque, buffer_size, write_packet, seek), StreamWriter(io_ctx, format) {} -} // namespace torchaudio::io +} // namespace io +} // namespace torchaudio diff --git a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp index 1478d38d5a..e9350f0479 100644 --- a/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp +++ b/torchaudio/csrc/ffmpeg/stream_writer/tensor_converter.cpp @@ -1,11 +1,11 @@ #include -#include #ifdef USE_CUDA #include #endif namespace torchaudio::io { + namespace { using InitFunc = TensorConverter::InitFunc; @@ -41,8 +41,8 @@ void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334 - if (!FFMPEG av_frame_is_writable(buffer)) { - int ret = FFMPEG av_frame_make_writable(buffer); + 
if (!av_frame_is_writable(buffer)) { + int ret = av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -145,8 +145,8 @@ void write_interlaced_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!FFMPEG av_frame_is_writable(buffer)) { - int ret = FFMPEG av_frame_make_writable(buffer); + if (!av_frame_is_writable(buffer)) { + int ret = av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -187,7 +187,7 @@ void write_planar_video( AVFrame* buffer, int num_planes) { const auto num_colors = - FFMPEG av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; + av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors); @@ -195,8 +195,8 @@ void write_planar_video( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width); // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!FFMPEG av_frame_is_writable(buffer)) { - int ret = FFMPEG av_frame_make_writable(buffer); + if (!av_frame_is_writable(buffer)) { + int ret = av_frame_make_writable(buffer); TORCH_INTERNAL_ASSERT( ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); } @@ -308,7 +308,7 @@ std::pair get_video_func(AVFrame* buffer) { TORCH_CHECK( false, "Unexpected pixel format for CUDA: ", - FFMPEG av_get_pix_fmt_name(sw_pix_fmt)); + av_get_pix_fmt_name(sw_pix_fmt)); } } @@ -317,7 +317,7 @@ std::pair get_video_func(AVFrame* buffer) { case AV_PIX_FMT_GRAY8: case AV_PIX_FMT_RGB24: case AV_PIX_FMT_BGR24: { - int channels = FFMPEG av_pix_fmt_desc_get(pix_fmt)->nb_components; + int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components; InitFunc init_func = [=](const torch::Tensor& 
t, AVFrame* f) { validate_video_input(t, f, channels); return init_interlaced(t); @@ -339,9 +339,7 @@ std::pair get_video_func(AVFrame* buffer) { } default: TORCH_CHECK( - false, - "Unexpected pixel format: ", - FFMPEG av_get_pix_fmt_name(pix_fmt)); + false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt)); } } @@ -385,9 +383,7 @@ TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size) break; default: TORCH_INTERNAL_ASSERT( - false, - "Unsupported media type: ", - FFMPEG av_get_media_type_string(type)); + false, "Unsupported media type: ", av_get_media_type_string(type)); } } diff --git a/torchaudio/csrc/ffmpeg/stub.cpp b/torchaudio/csrc/ffmpeg/stub.cpp deleted file mode 100644 index 4960b0050e..0000000000 --- a/torchaudio/csrc/ffmpeg/stub.cpp +++ /dev/null @@ -1,196 +0,0 @@ -#ifdef DLOPEN_FFMPEG - -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -} - -namespace torchaudio::io::detail { -namespace { -class StubImpl { - at::DynamicLibrary libavutil; - at::DynamicLibrary libavcodec; - at::DynamicLibrary libavformat; - at::DynamicLibrary libavdevice; - at::DynamicLibrary libavfilter; - - public: - // The struct that holds all the function pointers to be used. 
- FFmpegStub stub{}; - - StubImpl( - const char* util, - const char* codec, - const char* format, - const char* device, - const char* filter) - : libavutil(util), - libavcodec(codec), - libavformat(format), - libavdevice(device), - libavfilter(filter) { -#define set(X) stub.X = (decltype(FFmpegStub::X))libavutil.sym(#X) - set(av_buffer_ref); - set(av_buffer_unref); - set(av_d2q); - set(av_dict_free); - set(av_dict_get); - set(av_dict_set); - set(av_frame_alloc); - set(av_frame_free); - set(av_frame_get_buffer); - set(av_frame_is_writable); - set(av_frame_make_writable); - set(av_frame_unref); - set(av_freep); - set(av_get_channel_layout_nb_channels); - set(av_get_channel_name); - set(av_get_default_channel_layout); - set(av_get_media_type_string); - set(av_get_pix_fmt); - set(av_get_pix_fmt_name); - set(av_get_sample_fmt); - set(av_get_sample_fmt_name); - set(av_get_time_base_q); - set(av_hwdevice_ctx_create); - set(av_hwframe_ctx_alloc); - set(av_hwframe_ctx_init); - set(av_hwframe_get_buffer); - set(av_log_get_level); - set(av_log_set_level); - set(av_malloc); - set(av_pix_fmt_desc_get); - set(av_rescale_q); - set(av_sample_fmt_is_planar); - set(av_strdup); - set(av_strerror); - set(avutil_version); -#undef set - -#define set(X) stub.X = (decltype(FFmpegStub::X))libavcodec.sym(#X) - set(av_codec_is_decoder); - set(av_codec_is_encoder); - set(av_codec_iterate); - set(av_packet_alloc); - set(av_packet_clone); - set(av_packet_free); - set(av_packet_ref); - set(av_packet_rescale_ts); - set(av_packet_unref); - set(avcodec_alloc_context3); - set(avcodec_configuration); - set(avcodec_descriptor_get); - set(avcodec_find_decoder); - set(avcodec_find_decoder_by_name); - set(avcodec_find_encoder); - set(avcodec_find_encoder_by_name); - set(avcodec_flush_buffers); - set(avcodec_free_context); - set(avcodec_get_hw_config); - set(avcodec_get_name); - set(avcodec_open2); - set(avcodec_parameters_alloc); - set(avcodec_parameters_copy); - set(avcodec_parameters_free); - 
set(avcodec_parameters_from_context); - set(avcodec_parameters_to_context); - set(avcodec_receive_frame); - set(avcodec_receive_packet); - set(avcodec_send_frame); - set(avcodec_send_packet); - set(avcodec_version); -#undef set - -#define set(X) stub.X = (decltype(FFmpegStub::X))libavformat.sym(#X) - set(av_demuxer_iterate); - set(av_dump_format); - set(av_find_best_stream); - set(av_find_input_format); - set(av_guess_frame_rate); - set(av_interleaved_write_frame); - set(av_muxer_iterate); - set(av_read_frame); - set(av_seek_frame); - set(av_write_trailer); - set(avio_alloc_context); - set(avio_enum_protocols); - set(avio_closep); - set(avio_flush); - set(avio_open2); - set(avformat_alloc_context); - set(avformat_alloc_output_context2); - set(avformat_close_input); - set(avformat_find_stream_info); - set(avformat_free_context); - set(avformat_new_stream); - set(avformat_open_input); - set(avformat_version); - set(avformat_write_header); -#undef set - -#define set(X) stub.X = (decltype(FFmpegStub::X))libavdevice.sym(#X) - set(avdevice_register_all); - set(avdevice_version); -#undef set - -#define set(X) stub.X = (decltype(FFmpegStub::X))libavfilter.sym(#X) - set(av_buffersink_get_frame); - set(av_buffersrc_add_frame_flags); - set(avfilter_get_by_name); - set(avfilter_graph_alloc); - set(avfilter_graph_config); - set(avfilter_graph_create_filter); - set(avfilter_graph_free); - set(avfilter_graph_parse_ptr); - set(avfilter_inout_alloc); - set(avfilter_inout_free); - set(avfilter_version); -#undef set - } -}; - -static std::unique_ptr _stub; - -void _init_stub() { -#if defined(_WIN32) - _stub = std::make_unique( - "avutil-" AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dll", - "avcodec-" AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dll", - "avformat-" AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dll", - "avdevice-" AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dll", - "avfilter-" AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dll"); -#elif defined(__APPLE__) - _stub = std::make_unique( - 
"libavutil." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR) ".dylib", - "libavcodec." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR) ".dylib", - "libavformat." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR) ".dylib", - "libavdevice." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR) ".dylib", - "libavfilter." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR) ".dylib"); -#else - _stub = std::make_unique( - "libavutil.so." AV_STRINGIFY(LIBAVUTIL_VERSION_MAJOR), - "libavcodec.so." AV_STRINGIFY(LIBAVCODEC_VERSION_MAJOR), - "libavformat.so." AV_STRINGIFY(LIBAVFORMAT_VERSION_MAJOR), - "libavdevice.so." AV_STRINGIFY(LIBAVDEVICE_VERSION_MAJOR), - "libavfilter.so." AV_STRINGIFY(LIBAVFILTER_VERSION_MAJOR)); -#endif -} - -} // namespace - -FFmpegStub& ffmpeg_stub() { - static c10::once_flag init_flag; - c10::call_once(init_flag, _init_stub); - return _stub->stub; -} - -} // namespace torchaudio::io::detail - -#endif diff --git a/torchaudio/csrc/ffmpeg/stub.h b/torchaudio/csrc/ffmpeg/stub.h deleted file mode 100644 index ae6e0a3d1c..0000000000 --- a/torchaudio/csrc/ffmpeg/stub.h +++ /dev/null @@ -1,313 +0,0 @@ -#pragma once - -// Abstraction of the access to FFmpeg libraries. -// -// Do not include this in header files. -// Include this header in implementation files and prepend -// all the calls to libav functions with FFMPEG macro. -// -// If DLOPEN_FFMPEG is not defined, FFMPEG macro is empty. -// In this case, FFmpeg libraries are linked at the time torchaudio is built. -// -// If DLOPEN_FFMPEG is defined, FFMPEG macro becomes a function call to -// fetch a stub instance of FFmpeg libraries. -// This function also initializes the function pointers by automatically -// dlopens all the required libraries. -// - -#ifndef DLOPEN_FFMPEG -#define FFMPEG -#else -#define FFMPEG detail::ffmpeg_stub(). 
- -#include - -namespace torchaudio::io::detail { - -struct FFmpegStub; - -// dlopen FFmpeg libraries and populate the methods of stub instance, -// then return the reference to the stub instance -FFmpegStub& ffmpeg_stub(); - -struct FFmpegStub { - ///////////////////////////////////////////////////////////////////////////// - // libavutil - ///////////////////////////////////////////////////////////////////////////// - - AVBufferRef* (*av_buffer_ref)(const AVBufferRef*); - - void (*av_buffer_unref)(AVBufferRef**); - - AVRational (*av_d2q)(double, int) av_const; - - void (*av_dict_free)(AVDictionary**); - - AVDictionaryEntry* (*av_dict_get)( - const AVDictionary*, - const char*, - const AVDictionaryEntry*, - int); - - int (*av_dict_set)(AVDictionary**, const char*, const char*, int); - - AVFrame* (*av_frame_alloc)(); - - void (*av_frame_free)(AVFrame**); - - int (*av_frame_get_buffer)(AVFrame*, int); - - int (*av_frame_is_writable)(AVFrame*); - - int (*av_frame_make_writable)(AVFrame*); - - void (*av_frame_unref)(AVFrame*); - - void (*av_freep)(void*); - - int (*av_get_channel_layout_nb_channels)(uint64_t); - - const char* (*av_get_channel_name)(uint64_t); - - int64_t (*av_get_default_channel_layout)(int); - - const char* (*av_get_media_type_string)(enum AVMediaType); - - enum AVPixelFormat (*av_get_pix_fmt)(const char*); - - const char* (*av_get_pix_fmt_name)(enum AVPixelFormat); - - enum AVSampleFormat (*av_get_sample_fmt)(const char*); - - const char* (*av_get_sample_fmt_name)(enum AVSampleFormat); - - AVRational (*av_get_time_base_q)(); - - int (*av_hwdevice_ctx_create)( - AVBufferRef**, - enum AVHWDeviceType, - const char*, - AVDictionary*, - int); - - AVBufferRef* (*av_hwframe_ctx_alloc)(AVBufferRef*); - - int (*av_hwframe_ctx_init)(AVBufferRef*); - - int (*av_hwframe_get_buffer)(AVBufferRef*, AVFrame*, int); - - int (*av_log_get_level)(); - - void (*av_log_set_level)(int); - - void* (*av_malloc)(size_t); - - const AVPixFmtDescriptor* 
(*av_pix_fmt_desc_get)(enum AVPixelFormat); - - int64_t (*av_rescale_q)(int64_t, AVRational, AVRational) av_const; - - int (*av_sample_fmt_is_planar)(enum AVSampleFormat); - - char* (*av_strdup)(const char*); - - int (*av_strerror)(int, char*, size_t); - - unsigned (*avutil_version)(); - - ///////////////////////////////////////////////////////////////////////////// - // libavcodec - ///////////////////////////////////////////////////////////////////////////// - - int (*av_codec_is_decoder)(const AVCodec*); - - int (*av_codec_is_encoder)(const AVCodec*); - - const AVCodec* (*av_codec_iterate)(void**); - - AVPacket* (*av_packet_alloc)(); - - AVPacket* (*av_packet_clone)(const AVPacket*); - - void (*av_packet_free)(AVPacket**); - - int (*av_packet_ref)(AVPacket*, const AVPacket*); - - void (*av_packet_rescale_ts)(AVPacket*, AVRational, AVRational); - - void (*av_packet_unref)(AVPacket*); - - AVCodecContext* (*avcodec_alloc_context3)(const AVCodec*); - - const char* (*avcodec_configuration)(); - - const AVCodecDescriptor* (*avcodec_descriptor_get)(enum AVCodecID); - - AVCodec* (*avcodec_find_decoder)(enum AVCodecID); - - AVCodec* (*avcodec_find_decoder_by_name)(const char*); - - AVCodec* (*avcodec_find_encoder)(enum AVCodecID); - - AVCodec* (*avcodec_find_encoder_by_name)(const char*); - - void (*avcodec_flush_buffers)(AVCodecContext*); - - void (*avcodec_free_context)(AVCodecContext**); - - const AVCodecHWConfig* (*avcodec_get_hw_config)(const AVCodec*, int); - - const char* (*avcodec_get_name)(enum AVCodecID); - - int (*avcodec_open2)(AVCodecContext*, const AVCodec*, AVDictionary**); - - AVCodecParameters* (*avcodec_parameters_alloc)(); - - int (*avcodec_parameters_copy)(AVCodecParameters*, const AVCodecParameters*); - - void (*avcodec_parameters_free)(AVCodecParameters**); - - int (*avcodec_parameters_from_context)( - AVCodecParameters*, - const AVCodecContext*); - - int (*avcodec_parameters_to_context)( - AVCodecContext*, - const AVCodecParameters*); - - int 
(*avcodec_receive_frame)(AVCodecContext*, AVFrame*); - - int (*avcodec_receive_packet)(AVCodecContext*, AVPacket*); - - int (*avcodec_send_frame)(AVCodecContext*, const AVFrame*); - - int (*avcodec_send_packet)(AVCodecContext*, const AVPacket*); - - unsigned (*avcodec_version)(); - - ///////////////////////////////////////////////////////////////////////////// - // libavformat - ///////////////////////////////////////////////////////////////////////////// - - const AVInputFormat* (*av_demuxer_iterate)(void**); - - void (*av_dump_format)(AVFormatContext*, int, const char*, int); - - int (*av_find_best_stream)( - AVFormatContext*, - enum AVMediaType, - int, - int, - AVCodec**, - int); - - AVInputFormat* (*av_find_input_format)(const char*); - - AVRational (*av_guess_frame_rate)(AVFormatContext*, AVStream*, AVFrame*); - - int (*av_interleaved_write_frame)(AVFormatContext*, AVPacket*); - - const AVOutputFormat* (*av_muxer_iterate)(void**); - - int (*av_read_frame)(AVFormatContext*, AVPacket*); - - int (*av_seek_frame)(AVFormatContext*, int, int64_t, int); - - int (*av_write_trailer)(AVFormatContext* s); - - AVIOContext* (*avio_alloc_context)( - unsigned char*, - int, - int, - void*, - int (*)(void*, uint8_t*, int), - int (*)(void*, uint8_t*, int), - int64_t (*)(void*, int64_t, int)); - - const char* (*avio_enum_protocols)(void**, int); - - int (*avio_closep)(AVIOContext**); - - void (*avio_flush)(AVIOContext*); - - int (*avio_open2)( - AVIOContext**, - const char*, - int, - const AVIOInterruptCB*, - AVDictionary**); - - AVFormatContext* (*avformat_alloc_context)(); - - int (*avformat_alloc_output_context2)( - AVFormatContext**, - AVOutputFormat*, - const char*, - const char*); - - void (*avformat_close_input)(AVFormatContext**); - - int (*avformat_find_stream_info)(AVFormatContext*, AVDictionary**); - - void (*avformat_free_context)(AVFormatContext*); - - AVStream* (*avformat_new_stream)(AVFormatContext*, const AVCodec*); - - int (*avformat_open_input)( - 
AVFormatContext**, - const char*, - AVFORMAT_CONST AVInputFormat*, - AVDictionary**); - - unsigned (*avformat_version)(); - - int (*avformat_write_header)(AVFormatContext*, AVDictionary**); - - ///////////////////////////////////////////////////////////////////////////// - // libavdevice - ///////////////////////////////////////////////////////////////////////////// - - void (*avdevice_register_all)(); - - unsigned (*avdevice_version)(); - - ///////////////////////////////////////////////////////////////////////////// - // libavfilter - ///////////////////////////////////////////////////////////////////////////// - - int (*av_buffersink_get_frame)(AVFilterContext*, AVFrame*); - - int (*av_buffersrc_add_frame_flags)(AVFilterContext*, AVFrame*, int); - - const AVFilter* (*avfilter_get_by_name)(const char*); - - AVFilterGraph* (*avfilter_graph_alloc)(); - - int (*avfilter_graph_config)(AVFilterGraph*, void*); - - int (*avfilter_graph_create_filter)( - AVFilterContext**, - const AVFilter*, - const char*, - const char*, - void*, - AVFilterGraph*); - - void (*avfilter_graph_free)(AVFilterGraph**); - - int (*avfilter_graph_parse_ptr)( - AVFilterGraph*, - const char*, - AVFilterInOut**, - AVFilterInOut**, - void*); - - AVFilterInOut* (*avfilter_inout_alloc)(); - - void (*avfilter_inout_free)(AVFilterInOut**); - - unsigned (*avfilter_version)(); -}; - -} // namespace torchaudio::io::detail - -#endif diff --git a/torchaudio/csrc/forced_align/cpu/compute.cpp b/torchaudio/csrc/forced_align/cpu/compute.cpp index da42cf942c..d9f735af47 100644 --- a/torchaudio/csrc/forced_align/cpu/compute.cpp +++ b/torchaudio/csrc/forced_align/cpu/compute.cpp @@ -17,8 +17,10 @@ void forced_align_impl( const scalar_t kNegInfinity = -std::numeric_limits::infinity(); using target_t = typename std:: conditional::type; - const auto T = logProbs.size(0); - const auto L = targets.size(0); + const auto batchIndex = + 0; // TODO: support batch version and use the real batch index + const auto T = 
logProbs.size(1); + const auto L = targets.size(1); const auto S = 2 * L + 1; torch::Tensor alphas = torch::empty( {2, S}, @@ -27,14 +29,14 @@ void forced_align_impl( .dtype(logProbs.dtype())) .fill_(kNegInfinity); torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); - auto logProbs_a = logProbs.accessor(); - auto targets_a = targets.accessor(); - auto paths_a = paths.accessor(); + auto logProbs_a = logProbs.accessor(); + auto targets_a = targets.accessor(); + auto paths_a = paths.accessor(); auto alphas_a = alphas.accessor(); auto backPtr_a = backPtr.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { - if (targets_a[i] == targets_a[i - 1]) { + if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) { ++R; } } @@ -49,20 +51,22 @@ void forced_align_impl( auto start = T - (L + R) > 0 ? 0 : 1; auto end = (S == 1) ? 1 : 2; for (auto i = start; i < end; i++) { - auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - alphas_a[0][i] = logProbs_a[0][labelIdx]; + auto labelIdx = (i % 2 == 0) ? 
blank : targets_a[batchIndex][i / 2]; + alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx]; } for (auto t = 1; t < T; t++) { if (T - t <= L + R) { if ((start % 2 == 1) && - targets_a[start / 2] != targets_a[start / 2 + 1]) { + targets_a[batchIndex][start / 2] != + targets_a[batchIndex][start / 2 + 1]) { start = start + 1; } start = start + 1; } if (t <= L + R) { if (end % 2 == 0 && end < 2 * L && - targets_a[end / 2 - 1] != targets_a[end / 2]) { + targets_a[batchIndex][end / 2 - 1] != + targets_a[batchIndex][end / 2]) { end = end + 1; } end = end + 1; @@ -75,7 +79,7 @@ void forced_align_impl( } if (start == 0) { alphas_a[curIdxOffset][0] = - alphas_a[prevIdxOffset][0] + logProbs_a[t][blank]; + alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; backPtr_a[t][0] = 0; startloop += 1; } @@ -85,13 +89,14 @@ void forced_align_impl( auto x1 = alphas_a[prevIdxOffset][i - 1]; auto x2 = -std::numeric_limits::infinity(); - auto labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; + auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; // In CTC, the optimal path may optionally chose to skip a blank label. // x2 represents skipping a letter, and can only happen if we're not // currently on a blank_label, and we're not on a repeat letter // (i != 1) just ensures we don't access targets[i - 2] if its i < 2 - if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) { + if (i % 2 != 0 && i != 1 && + targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; @@ -105,7 +110,7 @@ void forced_align_impl( result = x0; backPtr_a[t][i] = 0; } - alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx]; + alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; } } auto idx1 = (T - 1) % 2; @@ -113,8 +118,8 @@ void forced_align_impl( // path stores the token index for each time step after force alignment. 
auto indexScores = 0; for (auto t = T - 1; t > -1; t--) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[ltrIdx / 2]; - paths_a[t] = lbl_idx; + auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + paths_a[batchIndex][t] = lbl_idx; ++indexScores; ltrIdx -= backPtr_a[t][ltrIdx]; } @@ -142,30 +147,35 @@ std::tuple compute( TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); TORCH_CHECK( - logProbs.dim() != 3, - "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.") + logProbs.dim() == 3, + "log_probs must be 3-D (batch_size, input length, num classes)"); TORCH_CHECK( - targets.dim() != 2, - "2-D tensor is not yet supported for targets, please provide 1-D tensor.") + targets.dim() == 2, "targets must be 2-D (batch_size, target length,)"); TORCH_CHECK( - logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)"); - TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)"); - TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D"); - TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D"); + inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + logProbs.size(0) == 1, + "The batch dimension for log_probs must be 1 at the current version.") + TORCH_CHECK( + targets.size(0) == 1, + "The batch dimension for targets must be 1 at the current version.") TORCH_CHECK( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); TORCH_CHECK( - logProbs.size(0) == at::max(inputLengths).item().toInt(), + logProbs.size(1) == at::max(inputLengths).item().toInt(), "input length mismatch"); TORCH_CHECK( - targets.size(0) == at::max(targetLengths).item().toInt(), + targets.size(1) == at::max(targetLengths).item().toInt(), "target length mismatch"); - const auto 
T = logProbs.size(0); + const auto B = logProbs.size(0); + const auto T = logProbs.size(1); auto paths = torch::zeros( - {T}, + {B, T}, torch::TensorOptions().device(targets.device()).dtype(targets.dtype())); AT_DISPATCH_FLOATING_TYPES_AND_HALF( logProbs.scalar_type(), "forced_align_impl", [&] { @@ -180,9 +190,10 @@ std::tuple compute( return std::make_tuple( paths, logProbs.index( - {torch::linspace( + {torch::indexing::Slice(), + torch::linspace( 0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())), - paths})); + paths.index({0})})); } TORCH_LIBRARY_IMPL(torchaudio, CPU, m) { diff --git a/torchaudio/csrc/forced_align/gpu/compute.cu b/torchaudio/csrc/forced_align/gpu/compute.cu index d869473831..b23d52f1f3 100644 --- a/torchaudio/csrc/forced_align/gpu/compute.cu +++ b/torchaudio/csrc/forced_align/gpu/compute.cu @@ -18,9 +18,9 @@ namespace alignment { namespace gpu { template __global__ void falign_cuda_step_kernel( - const torch::PackedTensorAccessor32 + const torch::PackedTensorAccessor32 logProbs_a, - const torch::PackedTensorAccessor32 + const torch::PackedTensorAccessor32 targets_a, const int T, const int L, @@ -36,6 +36,8 @@ __global__ void falign_cuda_step_kernel( torch::PackedTensorAccessor32 backPtrBuffer_a) { scalar_t kNegInfinity = -std::numeric_limits::infinity(); + const int batchIndex = + 0; // TODO: support batch version and use the real batch index int S = 2 * L + 1; int curIdxOffset = (t % 2); // current time step frame for alpha int prevIdxOffset = ((t - 1) % 2); // previous time step frame for alpha @@ -49,8 +51,8 @@ __global__ void falign_cuda_step_kernel( __syncthreads(); if (t == 0) { for (unsigned int i = start + threadIdx.x; i < end; i += blockDim.x) { - int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - alphas_a[curIdxOffset][i] = logProbs_a[0][labelIdx]; + int labelIdx = (i % 2 == 0) ? 
blank : targets_a[batchIndex][i / 2]; + alphas_a[curIdxOffset][i] = logProbs_a[batchIndex][0][labelIdx]; } return; } @@ -62,7 +64,7 @@ __global__ void falign_cuda_step_kernel( threadMax = kNegInfinity; if (start == 0 && threadIdx.x == 0) { alphas_a[curIdxOffset][0] = - alphas_a[prevIdxOffset][0] + logProbs_a[t][blank]; + alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; threadMax = max(threadMax, alphas_a[curIdxOffset][0]); backPtrBuffer_a[backPtrBufferLen][0] = 0; } @@ -73,8 +75,9 @@ __global__ void falign_cuda_step_kernel( scalar_t x0 = alphas_a[prevIdxOffset][i]; scalar_t x1 = alphas_a[prevIdxOffset][i - 1]; scalar_t x2 = kNegInfinity; - int labelIdx = (i % 2 == 0) ? blank : targets_a[i / 2]; - if (i % 2 != 0 && i != 1 && targets_a[i / 2] != targets_a[i / 2 - 1]) { + int labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; + if (i % 2 != 0 && i != 1 && + targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { x2 = alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; @@ -88,7 +91,7 @@ __global__ void falign_cuda_step_kernel( result = x0; backPtrBuffer_a[backPtrBufferLen][i] = 0; } - alphas_a[curIdxOffset][i] = result + logProbs_a[t][labelIdx]; + alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; threadMax = max(threadMax, alphas_a[curIdxOffset][i]); } scalar_t maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max()); @@ -113,10 +116,12 @@ void forced_align_impl( const scalar_t kNegInfinity = -std::numeric_limits::infinity(); using target_t = typename std:: conditional::type; - auto paths_a = paths.accessor(); - const int T = logProbs.size(0); // num frames - const int N = logProbs.size(1); // alphabet size - const int L = targets.size(0); // label length + auto paths_a = paths.accessor(); + const int batchIndex = + 0; // TODO: support batch version and use the real batch index + const int T = logProbs.size(1); // num frames + const int N = logProbs.size(2); // alphabet size + const int L = 
targets.size(1); // label length const int S = 2 * L + 1; auto targetsCpu = targets.to(torch::kCPU); // backPtrBuffer stores the index offset fthe best path at current position @@ -144,12 +149,12 @@ void forced_align_impl( .device(logProbs.device())) .fill_(kNegInfinity); // CPU accessors - auto targetsCpu_a = targetsCpu.accessor(); + auto targetsCpu_a = targetsCpu.accessor(); auto backPtrCpu_a = backPtrCpu.accessor(); // count the number of repeats in label int R = 0; for (int i = 1; i < L; ++i) { - if (targetsCpu_a[i] == targetsCpu_a[i - 1]) { + if (targetsCpu_a[batchIndex][i] == targetsCpu_a[batchIndex][i - 1]) { ++R; } } @@ -169,14 +174,16 @@ void forced_align_impl( if (t > 0) { if (T - t <= L + R) { if ((start % 2 == 1) && - (targetsCpu_a[start / 2] != targetsCpu_a[start / 2 + 1])) { + (targetsCpu_a[batchIndex][start / 2] != + targetsCpu_a[batchIndex][start / 2 + 1])) { start = start + 1; } start = start + 1; } if (t <= L + R) { if ((end % 2 == 0) && (end < 2 * L) && - (targetsCpu_a[end / 2 - 1] != targetsCpu_a[end / 2])) { + (targetsCpu_a[batchIndex][end / 2 - 1] != + targetsCpu_a[batchIndex][end / 2])) { end = end + 1; } end = end + 1; @@ -184,8 +191,8 @@ void forced_align_impl( } falign_cuda_step_kernel <<<1, kNumThreads, 0, defaultStream>>>( - logProbs.packed_accessor32(), - targets.packed_accessor32(), + logProbs.packed_accessor32(), + targets.packed_accessor32(), T, L, N, @@ -229,8 +236,9 @@ void forced_align_impl( : S - 2; int indexScores = 0; for (int t = T - 1; t >= 0; --t) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targetsCpu_a[ltrIdx / 2]; - paths_a[t] = lbl_idx; + auto lbl_idx = + ltrIdx % 2 == 0 ? 
blank : targetsCpu_a[batchIndex][ltrIdx / 2]; + paths_a[batchIndex][t] = lbl_idx; ++indexScores; ltrIdx -= backPtrCpu_a[t][ltrIdx]; } @@ -258,30 +266,36 @@ std::tuple compute( TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); TORCH_CHECK( - logProbs.dim() != 3, - "3-D tensor is not yet supported for log_probs, please provide 2-D tensor.") + logProbs.dim() == 3, + "log_probs must be 3-D (batch_size, input length, num classes)"); TORCH_CHECK( - targets.dim() != 2, - "2-D tensor is not yet supported for targets, please provide 1-D tensor.") + targets.dim() == 2, "targets must be 2-D (batch_size, target length,)"); TORCH_CHECK( - logProbs.dim() == 2, "log_probs must be 2-D (input length, num classes)"); - TORCH_CHECK(targets.dim() == 1, "targets must be 1-D (target length,)"); - TORCH_CHECK(inputLengths.dim() == 0, "input_lengths must be 0-D"); - TORCH_CHECK(targetLengths.dim() == 0, "target_lengths must be 0-D"); + inputLengths.dim() == 1, "input_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + targetLengths.dim() == 1, "target_lengths must be 1-D (batch_size,)"); + TORCH_CHECK( + logProbs.size(0) == 1, + "The batch dimension for log_probs must be 1 at the current version.") + TORCH_CHECK( + targets.size(0) == 1, + "The batch dimension for targets must be 1 at the current version.") TORCH_CHECK( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); TORCH_CHECK( - logProbs.size(0) == at::max(inputLengths).item().toInt(), + logProbs.size(1) == at::max(inputLengths).item().toInt(), "input length mismatch"); TORCH_CHECK( - targets.size(0) == at::max(targetLengths).item().toInt(), + targets.size(1) == at::max(targetLengths).item().toInt(), "target length mismatch"); - auto T = logProbs.size(0); // num frames + auto B = logProbs.size(0); + auto T = logProbs.size(1); // num frames auto paths = torch::zeros( - {T}, 
torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype())); + {B, T}, + torch::TensorOptions().device(torch::kCPU).dtype(targets.dtype())); AT_DISPATCH_FLOATING_TYPES_AND_HALF( logProbs.scalar_type(), "forced_align_impl", [&] { if (targets.scalar_type() == torch::kInt64) { @@ -295,9 +309,10 @@ std::tuple compute( return std::make_tuple( paths.to(logProbs.device()), logProbs.index( - {torch::linspace( + {torch::indexing::Slice(), + torch::linspace( 0, T - 1, T, torch::TensorOptions().dtype(paths.dtype())), - paths})); + paths.index({0})})); } TORCH_LIBRARY_IMPL(torchaudio, CUDA, m) { diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py index e6457d299c..8b732cf663 100644 --- a/torchaudio/functional/functional.py +++ b/torchaudio/functional/functional.py @@ -2511,12 +2511,12 @@ def forced_align( Args: log_probs (torch.Tensor): log probability of CTC emission output. - Tensor of shape `(T, C)`. where `T` is the input length, + Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length, `C` is the number of characters in alphabet including blank. - targets (torch.Tensor): Target sequence. Tensor of shape `(L,)`, + targets (torch.Tensor): Target sequence. Tensor of shape `(B, L)`, where `L` is the target length. - input_lengths (torch.Tensor): Lengths of the inputs (max value must each be <= `T`). 0-D Tensor (scalar). - target_lengths (torch.Tensor): Lengths of the targets. 0-D Tensor (scalar). + input_lengths (torch.Tensor): Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`. + target_lengths (torch.Tensor): Lengths of the targets. 1-D Tensor of shape `(B,)`. blank_id (int, optional): The index of blank symbol in CTC emission. (Default: 0) Returns: @@ -2534,6 +2534,9 @@ def forced_align( where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens. For example, in str `"aabbc"`, the number of repeats are `2`. 
+ + Note: + The current version only supports ``batch_size == 1``. """ if blank in targets: raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.")