Use PyAV for atempo, volume filters

WyattBlue · Sep 18, 2024 · 393db50 · 393db50
1 parent ee492f2
commit 393db50
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 42 deletions.
diff --git a/auto_editor/render/audio.py b/auto_editor/render/audio.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+import io
 from pathlib import Path
 from platform import system
 from subprocess import PIPE
 
+import av
 import numpy as np
 
 from auto_editor.ffwrapper import FFmpeg, FileInfo
@@ -12,12 +14,12 @@
 from auto_editor.lib.contracts import andc, between_c, is_int_or_float
 from auto_editor.lib.err import MyError
 from auto_editor.output import Ensure
-from auto_editor.timeline import v3
+from auto_editor.timeline import TlAudio, v3
 from auto_editor.utils.bar import Bar
 from auto_editor.utils.cmdkw import ParserError, parse_with_palet, pAttr, pAttrs
 from auto_editor.utils.log import Log
 from auto_editor.utils.types import Args
-from auto_editor.wavfile import AudioData, read, write
+from auto_editor.wavfile import AudioData, read, read_fid, write
 
 norm_types = {
     "ebu": pAttrs(
@@ -165,6 +167,72 @@ def apply_audio_normalization(
     ffmpeg.run(["-i", f"{pre_master}"] + cmd + [f"{path}"])
 
 
+def _atempo_args(speed: float) -> list[str]:
+    """Return one `atempo` argument string per filter stage needed for *speed*.
+
+    The stages multiply together to *speed*; an empty list means no tempo
+    change. Several stages are used when *speed* is above 100 or below 0.5.
+    """
+    if speed == 1:
+        return []
+    if speed > 10_000:
+        return [f"{speed ** (1/3)}"] * 3
+    if speed > 100:
+        return [f"{speed ** 0.5}"] * 2
+    if speed >= 0.5:
+        return [f"{speed}"]
+
+    # Slower than 0.5x: emit one 0.5 stage per halving, then a final stage
+    # with the remainder. Starting at 1.0 (not 0.5) keeps the product of all
+    # stages equal to `speed`; starting at 0.5 produced 2 * speed.
+    stages: list[str] = []
+    start = 1.0
+    while start * 0.5 > speed:
+        start *= 0.5
+        stages.append("0.5")
+    stages.append(f"{speed / start}")
+    return stages
+
+
+def process_audio_clip(
+    clip: TlAudio,
+    samp_list: AudioData,
+    samp_start: int,
+    samp_end: int,
+    sr: int,
+    temp: str,
+) -> AudioData:
+    """Apply the clip's speed and volume to a slice of samples using PyAV.
+
+    The slice `samp_list[samp_start:samp_end]` is written to `input.wav`
+    inside *temp*, decoded, pushed through an `atempo` chain (when
+    `clip.speed != 1`) and a `volume` filter (when `clip.volume != 1`), and
+    encoded as `pcm_s16le` WAV into an in-memory buffer.
+
+    Returns the sample array parsed back from that buffer by `read_fid`.
+    """
+    input_path = Path(temp, "input.wav")
+
+    with open(input_path, "wb") as fid:
+        write(fid, sr, samp_list[samp_start:samp_end])
+
+    output_bytes = io.BytesIO()
+
+    # Context managers close both containers even when filtering or encoding
+    # raises, so `input.wav` is not left open for the next call.
+    with av.open(input_path, "r") as input_file:
+        input_stream = input_file.streams.audio[0]
+
+        with av.open(output_bytes, mode="w", format="wav") as output_file:
+            output_stream = output_file.add_stream("pcm_s16le", rate=sr)
+            assert isinstance(output_stream, av.audio.AudioStream)
+
+            graph = av.filter.Graph()
+            args = [graph.add_abuffer(template=input_stream)]
+            args.extend(
+                graph.add("atempo", tempo) for tempo in _atempo_args(clip.speed)
+            )
+
+            if clip.volume != 1:
+                args.append(graph.add("volume", f"{clip.volume}"))
+
+            args.append(graph.add("abuffersink"))
+            graph.link_nodes(*args).configure()
+
+            for frame in input_file.decode(input_stream):
+                graph.push(frame)
+                # Drain every frame the graph can produce for this input.
+                while True:
+                    try:
+                        for packet in output_stream.encode(graph.pull()):
+                            output_file.mux(packet)
+                    except (av.BlockingIOError, av.EOFError):
+                        break
+
+            # NOTE(review): only the encoder is flushed below; the filter
+            # graph is never sent end-of-stream, so samples still buffered in
+            # a filter may not reach the output -- confirm.
+            for packet in output_stream.encode(None):
+                output_file.mux(packet)
+
+    # The output container is closed at this point, so the buffer is complete.
+    output_bytes.seek(0)
+    return read_fid(io.BufferedReader(output_bytes))[1]
+
+
 def make_new_audio(
     tl: v3, ensure: Ensure, args: Args, ffmpeg: FFmpeg, bar: Bar, log: Log
 ) -> list[str]:
@@ -175,7 +243,6 @@ def make_new_audio(
 
     norm = parse_norm(args.audio_normalize, log)
 
-    af_tick = 0
     temp = log.temp
 
     if not tl.a or not tl.a[0]:
@@ -214,42 +281,12 @@ def make_new_audio(
             if samp_end > len(samp_list):
                 samp_end = len(samp_list)
 
-            filters: list[str] = []
-
-            if clip.speed != 1:
-                if clip.speed > 10_000:
-                    filters.extend([f"atempo={clip.speed}^.33333"] * 3)
-                elif clip.speed > 100:
-                    filters.extend(
-                        [f"atempo=sqrt({clip.speed})", f"atempo=sqrt({clip.speed})"]
-                    )
-                elif clip.speed >= 0.5:
-                    filters.append(f"atempo={clip.speed}")
-                else:
-                    start = 0.5
-                    while start * 0.5 > clip.speed:
-                        start *= 0.5
-                        filters.append("atempo=0.5")
-                    filters.append(f"atempo={clip.speed / start}")
-
-            if clip.volume != 1:
-                filters.append(f"volume={clip.volume}")
-
-            if not filters:
-                clip_arr = samp_list[samp_start:samp_end]
+            if clip.speed != 1 or clip.volume != 1:
+                clip_arr = process_audio_clip(
+                    clip, samp_list, samp_start, samp_end, sr, temp
+                )
             else:
-                af = Path(temp, f"af{af_tick}.wav")
-                af_out = Path(temp, f"af{af_tick}_out.wav")
-
-                # Windows can't replace a file that's already in use, so we have to
-                # cycle through file names.
-                af_tick = (af_tick + 1) % 3
-
-                with open(af, "wb") as fid:
-                    write(fid, sr, samp_list[samp_start:samp_end])
-
-                ffmpeg.run(["-i", f"{af}", "-af", ",".join(filters), f"{af_out}"])
-                clip_arr = read(f"{af_out}")[1]
+                clip_arr = samp_list[samp_start:samp_end]
 
             # Mix numpy arrays
             start = clip.start * sr // tb

diff --git a/auto_editor/wavfile.py b/auto_editor/wavfile.py
@@ -114,10 +114,24 @@ def _read_data_chunk(
     else:
         n_samples = (size - 1) // block_align
 
-    data = np.memmap(
-        fid, dtype=dtype, mode="c", offset=fid.tell(), shape=(n_samples, channels)
-    )
-    fid.seek(size, 1)
+    assert isinstance(fid, io.BufferedIOBase)
+
+    try:
+        fid.fileno()
+        is_file = True
+    except io.UnsupportedOperation:
+        is_file = False
+
+    if is_file:
+        data: AudioData = np.memmap(
+            fid, dtype=dtype, mode="c", offset=fid.tell(), shape=(n_samples, channels)
+        )
+        fid.seek(size, 1)
+    else:
+        bytes_per_sample = np.dtype(dtype).itemsize
+        buffer = fid.read(n_samples * channels * bytes_per_sample)
+        data = np.frombuffer(buffer, dtype=dtype).reshape((n_samples, channels))
+
     _handle_pad_byte(fid, size)
 
     return data
@@ -191,7 +205,10 @@ def _handle_pad_byte(fid: io.BufferedReader, size: int) -> None:
 
 def read(filename: str) -> tuple[int, AudioData]:
+    """Open *filename* in binary mode and return what `read_fid` parses from it.
+
+    NOTE(review): the handle opened here is not closed by this function.
+    """
     fid = open(filename, "rb")
+    return read_fid(fid)
+
 
+def read_fid(fid: io.BufferedReader) -> tuple[int, AudioData]:
     file_sig = fid.read(4)
     if file_sig in (b"RIFF", b"RIFX"):
         data_size, file_size, en = _read_riff_chunk(file_sig, fid)