Use PyAV for atempo, volume filters

WyattBlue · Sep 18, 2024 · 0bf09fb · 0bf09fb
1 parent ee492f2
commit 0bf09fb
Show file tree

Hide file tree

Showing 2 changed files with 94 additions and 50 deletions.
diff --git a/auto_editor/render/audio.py b/auto_editor/render/audio.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+import io
 from pathlib import Path
 from platform import system
 from subprocess import PIPE
 
+import av
 import numpy as np
 
 from auto_editor.ffwrapper import FFmpeg, FileInfo
@@ -12,12 +14,12 @@
 from auto_editor.lib.contracts import andc, between_c, is_int_or_float
 from auto_editor.lib.err import MyError
 from auto_editor.output import Ensure
-from auto_editor.timeline import v3
+from auto_editor.timeline import TlAudio, v3
 from auto_editor.utils.bar import Bar
 from auto_editor.utils.cmdkw import ParserError, parse_with_palet, pAttr, pAttrs
 from auto_editor.utils.log import Log
 from auto_editor.utils.types import Args
-from auto_editor.wavfile import AudioData, read, write
+from auto_editor.wavfile import AudioData, read, read_fid, write
 
 norm_types = {
     "ebu": pAttrs(
@@ -165,6 +167,68 @@ def apply_audio_normalization(
     ffmpeg.run(["-i", f"{pre_master}"] + cmd + [f"{path}"])
 
 
+def process_audio_clip(
+    clip: TlAudio, samp_list: AudioData, samp_start: int, samp_end: int, sr: int
+) -> AudioData:
+    """Apply a clip's speed and volume to a slice of samples, in memory.
+
+    ``samp_list[samp_start:samp_end]`` is written as WAV into a buffer,
+    decoded with PyAV, pushed through a chain of ``atempo`` / ``volume``
+    filters, re-encoded as ``pcm_s16le`` WAV, and parsed with ``read_fid``.
+
+    Returns the sample array read back from the filtered WAV.
+    Raises ``ValueError`` when ``clip.speed`` is not positive.
+    """
+    if clip.speed <= 0:
+        # A non-positive speed can never be reached by chaining factors and
+        # would otherwise loop or divide by zero below.
+        raise ValueError(f"clip.speed must be positive, got {clip.speed}")
+
+    # Split the requested speed into per-filter factors whose product is
+    # clip.speed (FFmpeg documents atempo's range as [0.5, 100.0]).
+    tempos: list[float] = []
+    if clip.speed > 10_000:
+        tempos = [clip.speed ** (1 / 3)] * 3
+    elif clip.speed > 100:
+        tempos = [clip.speed**0.5] * 2
+    elif clip.speed >= 0.5:
+        if clip.speed != 1:
+            tempos = [clip.speed]
+    else:
+        # Emit one 0.5 factor per doubling needed to bring the remainder up
+        # to >= 0.5, so that 0.5**k * remaining == clip.speed exactly.
+        remaining = clip.speed
+        while remaining < 0.5:
+            tempos.append(0.5)
+            remaining *= 2
+        tempos.append(remaining)
+
+    input_buffer = io.BytesIO()
+    write(input_buffer, sr, samp_list[samp_start:samp_end])
+    input_buffer.seek(0)
+    output_bytes = io.BytesIO()
+
+    # Context managers close both containers even if filtering fails.
+    with av.open(input_buffer, "r") as input_file, av.open(
+        output_bytes, mode="w", format="wav"
+    ) as output_file:
+        input_stream = input_file.streams.audio[0]
+        output_stream = output_file.add_stream("pcm_s16le", rate=sr)
+        assert isinstance(output_stream, av.audio.AudioStream)
+
+        graph = av.filter.Graph()
+        nodes = [graph.add_abuffer(template=input_stream)]
+        nodes.extend(graph.add("atempo", f"{tempo}") for tempo in tempos)
+        if clip.volume != 1:
+            nodes.append(graph.add("volume", f"{clip.volume}"))
+        nodes.append(graph.add("abuffersink"))
+        graph.link_nodes(*nodes).configure()
+
+        def drain() -> None:
+            # Encode and mux every frame the graph can currently produce.
+            while True:
+                try:
+                    aframe = graph.pull()
+                except (av.BlockingIOError, av.EOFError):
+                    return
+                assert isinstance(aframe, av.audio.AudioFrame)
+                for packet in output_stream.encode(aframe):
+                    output_file.mux(packet)
+
+        for frame in input_file.decode(input_stream):
+            graph.push(frame)
+            drain()
+
+        # Signal end-of-stream to the graph so the filters release whatever
+        # they are still buffering, then collect those trailing frames.
+        graph.push(None)
+        drain()
+
+        # Flush the encoder
+        for packet in output_stream.encode(None):
+            output_file.mux(packet)
+
+    output_bytes.seek(0)
+    return read_fid(output_bytes)[1]
+
+
 def make_new_audio(
     tl: v3, ensure: Ensure, args: Args, ffmpeg: FFmpeg, bar: Bar, log: Log
 ) -> list[str]:
@@ -175,7 +239,6 @@ def make_new_audio(
 
     norm = parse_norm(args.audio_normalize, log)
 
-    af_tick = 0
     temp = log.temp
 
     if not tl.a or not tl.a[0]:
@@ -214,42 +277,10 @@ def make_new_audio(
             if samp_end > len(samp_list):
                 samp_end = len(samp_list)
 
-            filters: list[str] = []
-
-            if clip.speed != 1:
-                if clip.speed > 10_000:
-                    filters.extend([f"atempo={clip.speed}^.33333"] * 3)
-                elif clip.speed > 100:
-                    filters.extend(
-                        [f"atempo=sqrt({clip.speed})", f"atempo=sqrt({clip.speed})"]
-                    )
-                elif clip.speed >= 0.5:
-                    filters.append(f"atempo={clip.speed}")
-                else:
-                    start = 0.5
-                    while start * 0.5 > clip.speed:
-                        start *= 0.5
-                        filters.append("atempo=0.5")
-                    filters.append(f"atempo={clip.speed / start}")
-
-            if clip.volume != 1:
-                filters.append(f"volume={clip.volume}")
-
-            if not filters:
-                clip_arr = samp_list[samp_start:samp_end]
+            if clip.speed != 1 or clip.volume != 1:
+                clip_arr = process_audio_clip(clip, samp_list, samp_start, samp_end, sr)
             else:
-                af = Path(temp, f"af{af_tick}.wav")
-                af_out = Path(temp, f"af{af_tick}_out.wav")
-
-                # Windows can't replace a file that's already in use, so we have to
-                # cycle through file names.
-                af_tick = (af_tick + 1) % 3
-
-                with open(af, "wb") as fid:
-                    write(fid, sr, samp_list[samp_start:samp_end])
-
-                ffmpeg.run(["-i", f"{af}", "-af", ",".join(filters), f"{af_out}"])
-                clip_arr = read(f"{af_out}")[1]
+                clip_arr = samp_list[samp_start:samp_end]
 
             # Mix numpy arrays
             start = clip.start * sr // tb

diff --git a/auto_editor/wavfile.py b/auto_editor/wavfile.py
@@ -3,7 +3,7 @@
 import io
 import struct
 import sys
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
@@ -15,13 +15,17 @@
 Endian = Literal[">", "<"]  # Big Endian, Little Endian
 ByteOrd = Literal["big", "little"]
 
+if TYPE_CHECKING:
+    Reader = io.BufferedReader | io.BytesIO
+    Writer = io.BufferedWriter | io.BytesIO
+
 
 class WavError(Exception):
+    """Error raised by this module's WAV reader, e.g. when no data is found."""
+
     pass
 
 
 def _read_fmt_chunk(
-    fid: io.BufferedReader, bytes_order: ByteOrd
+    fid: Reader, bytes_order: ByteOrd
 ) -> tuple[int, int, int, int, int]:
     size = int.from_bytes(fid.read(4), bytes_order)
 
@@ -69,7 +73,7 @@ def _read_fmt_chunk(
 
 
 def _read_data_chunk(
-    fid: io.BufferedReader,
+    fid: Reader,
     format_tag: int,
     channels: int,
     bit_depth: int,
@@ -114,16 +118,22 @@ def _read_data_chunk(
     else:
         n_samples = (size - 1) // block_align
 
-    data = np.memmap(
-        fid, dtype=dtype, mode="c", offset=fid.tell(), shape=(n_samples, channels)
-    )
-    fid.seek(size, 1)
+    if isinstance(fid, io.BufferedReader):
+        data: AudioData = np.memmap(
+            fid, dtype=dtype, mode="c", offset=fid.tell(), shape=(n_samples, channels)
+        )
+        fid.seek(size, 1)
+    else:
+        bytes_per_sample = np.dtype(dtype).itemsize
+        buffer = fid.read(n_samples * channels * bytes_per_sample)
+        data = np.frombuffer(buffer, dtype=dtype).reshape((n_samples, channels))
+
     _handle_pad_byte(fid, size)
 
     return data
 
 
-def _skip_unknown_chunk(fid: io.BufferedReader, en: Endian) -> None:
+def _skip_unknown_chunk(fid: Reader, en: Endian) -> None:
     data = fid.read(4)
 
     if len(data) == 4:
@@ -140,7 +150,7 @@ def _skip_unknown_chunk(fid: io.BufferedReader, en: Endian) -> None:
         )
 
 
-def _read_rf64_chunk(fid: io.BufferedReader) -> tuple[int, int, Endian]:
+def _read_rf64_chunk(fid: Reader) -> tuple[int, int, Endian]:
     # https://tech.ebu.ch/docs/tech/tech3306v1_0.pdf
     # https://www.itu.int/dms_pubrec/itu-r/rec/bs/R-REC-BS.2088-1-201910-I!!PDF-E.pdf
 
@@ -171,7 +181,7 @@ def _read_rf64_chunk(fid: io.BufferedReader) -> tuple[int, int, Endian]:
     return data_size, file_size, en
 
 
-def _read_riff_chunk(sig: bytes, fid: io.BufferedReader) -> tuple[None, int, Endian]:
+def _read_riff_chunk(sig: bytes, fid: Reader) -> tuple[None, int, Endian]:
     en: Endian = "<" if sig == b"RIFF" else ">"
     bytes_order: ByteOrd = "big" if en == ">" else "little"
 
@@ -184,14 +194,17 @@ def _read_riff_chunk(sig: bytes, fid: io.BufferedReader) -> tuple[None, int, End
     return None, file_size, en
 
 
-def _handle_pad_byte(fid: io.BufferedReader, size: int) -> None:
+def _handle_pad_byte(fid: Reader, size: int) -> None:
+    # When ``size`` is odd, advance the stream one byte from its current
+    # position (whence=1). Presumably this steps over the pad byte that
+    # follows an odd-sized chunk -- confirm against the RIFF spec.
     if size % 2 == 1:
         fid.seek(1, 1)
 
 
 def read(filename: str) -> tuple[int, AudioData]:
-    fid = open(filename, "rb")
+    """Open ``filename`` in binary mode and parse it with ``read_fid``."""
+    # Close the handle on every exit path, including when parsing raises.
+    # NOTE(review): the array returned for real files is an np.memmap; the
+    # mapping is expected to outlive the closed handle -- confirm on Windows.
+    with open(filename, "rb") as fid:
+        return read_fid(fid)
+
 
+def read_fid(fid: Reader) -> tuple[int, AudioData]:
     file_sig = fid.read(4)
     if file_sig in (b"RIFF", b"RIFX"):
         data_size, file_size, en = _read_riff_chunk(file_sig, fid)
@@ -241,7 +254,7 @@ def read(filename: str) -> tuple[int, AudioData]:
     raise WavError("Found no data")
 
 
-def write(fid: io.BufferedWriter, sr: int, arr: np.ndarray) -> None:
+def write(fid: Writer, sr: int, arr: np.ndarray) -> None:
     channels = 1 if arr.ndim == 1 else arr.shape[1]
     bit_depth = arr.dtype.itemsize * 8
     block_align = channels * (bit_depth // 8)