From 868dc875cc61597bff24a094ca855b9929ee2986 Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Wed, 6 Dec 2023 12:46:59 +0100
Subject: [PATCH 1/6] =?UTF-8?q?=E2=9C=A8=20Add=20basic=20video=20preview?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frontend/src/editor/player.tsx  |  1 +
 frontend/src/utils/use_audio.ts | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
index 88a46ca2..4052fe96 100644
--- a/frontend/src/editor/player.tsx
+++ b/frontend/src/editor/player.tsx
@@ -49,6 +49,7 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
   const audio = useAudio({
     playbackRate,
     sources,
+    videoPreview: true,
   });
 
   // calculate the start of the current element to color it
diff --git a/frontend/src/utils/use_audio.ts b/frontend/src/utils/use_audio.ts
index 6e934b29..7fe1ca9c 100644
--- a/frontend/src/utils/use_audio.ts
+++ b/frontend/src/utils/use_audio.ts
@@ -1,12 +1,13 @@
-import { actions, audio, events, props } from '@podlove/html5-audio-driver';
+import { actions, video, events, props, audio } from '@podlove/html5-audio-driver';
 import { useCallback, useEffect, useRef, useState } from 'react';
 
 type UseAudioOptions = {
   playbackRate?: number;
   sources: Array<{ src: string; type: string }>;
+  videoPreview?: boolean;
 };
 
-export function useAudio({ sources, playbackRate }: UseAudioOptions) {
+export function useAudio({ sources, playbackRate, videoPreview }: UseAudioOptions) {
   const [playing, setPlayingState] = useState(false);
   const [duration, setDuration] = useState<number | undefined>();
   const [buffering, setBuffering] = useState(false);
@@ -16,11 +17,22 @@ export function useAudio({ sources, playbackRate }: UseAudioOptions) {
   const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
 
   useEffect(() => {
-    const myAudioElement = audio([]);
+    const myAudioElement = videoPreview ? video([]) : audio([]);
+
     setAudioElement(myAudioElement);
 
     const e = events(myAudioElement);
     e.onDurationChange(() => {
+      if (videoPreview && myAudioElement.videoHeight > 0) {
+        myAudioElement.style = `
+          position: fixed;
+          bottom: 90px;
+          right: 20px;
+          height: 170px;
+          width: 300px;
+        `;
+      }
+
       setDuration(props(myAudioElement).duration);
     });
     e.onPlay(() => setPlayingState(true));

From 8c05d9d6b6a010a3a3089484a7eae783d4334d25 Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Thu, 7 Dec 2023 02:44:59 +0100
Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=A7=B1=20Include=20video=20in=20reenc?=
 =?UTF-8?q?oded=20preview=20media?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frontend/src/editor/player.tsx        | 17 +++++++------
 worker/transcribee_worker/config.py   |  5 +++-
 worker/transcribee_worker/reencode.py | 36 +++++++++++++++++----------
 worker/transcribee_worker/worker.py   |  3 ++-
 4 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
index 4052fe96..a6a59fd0 100644
--- a/frontend/src/editor/player.tsx
+++ b/frontend/src/editor/player.tsx
@@ -35,13 +35,16 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
   const [playbackRate, setPlaybackRate] = useLocalStorage('playbackRate', 1);
 
   const sources = useMemo(() => {
-    const mappedFiles =
-      data?.media_files.map((media) => {
-        return {
-          src: media.url,
-          type: media.content_type,
-        };
-      }) || [];
+    // do not play the original file, it may be large
+    const relevantMediaFiles =
+      data?.media_files.filter((media) => !media.tags.includes('original')) || [];
+
+    const mappedFiles = relevantMediaFiles.map((media) => {
+      return {
+        src: media.url,
+        type: media.content_type,
+      };
+    });
 
     return sortMediaFiles(mappedFiles);
   }, [data?.media_files]);
diff --git a/worker/transcribee_worker/config.py b/worker/transcribee_worker/config.py
index 2a020784..1b2c5e22 100644
--- a/worker/transcribee_worker/config.py
+++ b/worker/transcribee_worker/config.py
@@ -17,10 +17,13 @@ class Settings(BaseSettings):
             "audio_bitrate": "128k",
             "ac": "1",
         },
-        "m4a": {
+        "video:mp4": {
             "format": "mp4",
             "audio_bitrate": "128k",
             "ac": "1",
+            "c:v": "libx264",
+            "crf": "26",
+            "preset": "faster",
         },
     }
 
diff --git a/worker/transcribee_worker/reencode.py b/worker/transcribee_worker/reencode.py
index 1a6632b3..b588adcb 100644
--- a/worker/transcribee_worker/reencode.py
+++ b/worker/transcribee_worker/reencode.py
@@ -10,27 +10,37 @@ def get_duration(input_path: Path):
     return float(ffmpeg.probe(input_path)["format"]["duration"])
 
 
+def has_video(input_path: Path):
+    streams = ffmpeg.probe(input_path)["streams"]
+    for stream in streams:
+        if stream["codec_type"] == "video":
+            return True
+    return False
+
+
 async def reencode(
     input_path: Path,
     output_path: Path,
     output_params: dict[str, str],
     progress_callback: ProgressCallbackType,
     duration: float,
+    include_video: bool,
 ):
     def work(_):
-        cmd: subprocess.Popen = (
-            ffmpeg.input(input_path)
-            .output(
-                filename=output_path,
-                map="0:a",
-                loglevel="quiet",
-                stats=None,
-                progress="-",
-                map_metadata="-1",
-                **output_params
-            )
-            .run_async(pipe_stdout=True)
-        )
+        pipeline = ffmpeg.input(input_path)
+        streams = [pipeline.audio]
+        if include_video and has_video(input_path):
+            streams.append(pipeline.video)
+
+        cmd: subprocess.Popen = ffmpeg.output(
+            *streams,
+            filename=output_path,
+            loglevel="quiet",
+            stats=None,
+            progress="-",
+            map_metadata="-1",
+            **output_params
+        ).run_async(pipe_stdout=True)
         assert cmd.stdout
         raw_line: bytes
         progress_dict = {}
diff --git a/worker/transcribee_worker/worker.py b/worker/transcribee_worker/worker.py
index a8670fac..c07638cf 100644
--- a/worker/transcribee_worker/worker.py
+++ b/worker/transcribee_worker/worker.py
@@ -257,7 +257,7 @@ async def reencode(
 
         n_profiles = len(settings.REENCODE_PROFILES)
         for i, (profile, parameters) in enumerate(settings.REENCODE_PROFILES.items()):
-            output_path = self._get_tmpfile(f"reencode_{profile}")
+            output_path = self._get_tmpfile(f"reencode_{profile.replace(':', '_')}")
 
             await reencode(
                 document_audio,
@@ -269,6 +269,7 @@ async def reencode(
                     **kwargs,
                 ),
                 duration,
+                include_video=(profile.startswith("video:")),
             )
 
             tags = [f"profile:{profile}"] + [f"{k}:{v}" for k, v in parameters.items()]

From 06ba47a6b72e4cd213a61803daf9f876a778a7c8 Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Fri, 8 Dec 2023 16:10:06 +0100
Subject: [PATCH 3/6] =?UTF-8?q?=F0=9F=A7=B1=20Scale=20preview=20video=20to?=
 =?UTF-8?q?=20480p=20and=20keep=20audio=20file?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frontend/src/editor/player.tsx        |  5 ++++-
 worker/transcribee_worker/config.py   |  9 +++++++++
 worker/transcribee_worker/reencode.py | 10 +---------
 worker/transcribee_worker/worker.py   | 27 ++++++++++++++++++++++++++-
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
index a6a59fd0..7cd1555f 100644
--- a/frontend/src/editor/player.tsx
+++ b/frontend/src/editor/player.tsx
@@ -39,7 +39,10 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
     const relevantMediaFiles =
       data?.media_files.filter((media) => !media.tags.includes('original')) || [];
 
-    const mappedFiles = relevantMediaFiles.map((media) => {
+    const videoFiles = relevantMediaFiles.filter((media) => media.tags.includes('video'));
+    const audioFiles = relevantMediaFiles.filter((media) => !media.tags.includes('video'));
+
+    const mappedFiles = [...videoFiles, ...audioFiles].map((media) => {
       return {
         src: media.url,
         type: media.content_type,
diff --git a/worker/transcribee_worker/config.py b/worker/transcribee_worker/config.py
index 1b2c5e22..9dbaec9a 100644
--- a/worker/transcribee_worker/config.py
+++ b/worker/transcribee_worker/config.py
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
             "audio_bitrate": "128k",
             "ac": "1",
         },
+        "m4a": {
+            "format": "mp4",
+            "audio_bitrate": "128k",
+            "ac": "1",
+        },
         "video:mp4": {
             "format": "mp4",
             "audio_bitrate": "128k",
@@ -24,6 +29,10 @@ class Settings(BaseSettings):
             "c:v": "libx264",
             "crf": "26",
             "preset": "faster",
+            # downscale to 480p and pad to multiple of 2 (needed for libx264)
+            "vf": "scale='min(854,iw)':'min(480,ih)'"
+            ":force_original_aspect_ratio=decrease,"
+            "pad='iw+mod(iw\\,2)':'ih+mod(ih\\,2)",
         },
     }
 
diff --git a/worker/transcribee_worker/reencode.py b/worker/transcribee_worker/reencode.py
index b588adcb..bf29fcbe 100644
--- a/worker/transcribee_worker/reencode.py
+++ b/worker/transcribee_worker/reencode.py
@@ -10,14 +10,6 @@ def get_duration(input_path: Path):
     return float(ffmpeg.probe(input_path)["format"]["duration"])
 
 
-def has_video(input_path: Path):
-    streams = ffmpeg.probe(input_path)["streams"]
-    for stream in streams:
-        if stream["codec_type"] == "video":
-            return True
-    return False
-
-
 async def reencode(
     input_path: Path,
     output_path: Path,
@@ -29,7 +21,7 @@ async def reencode(
     def work(_):
         pipeline = ffmpeg.input(input_path)
         streams = [pipeline.audio]
-        if include_video and has_video(input_path):
+        if include_video:
             streams.append(pipeline.video)
 
         cmd: subprocess.Popen = ffmpeg.output(
diff --git a/worker/transcribee_worker/worker.py b/worker/transcribee_worker/worker.py
index c07638cf..f77c6789 100644
--- a/worker/transcribee_worker/worker.py
+++ b/worker/transcribee_worker/worker.py
@@ -9,6 +9,7 @@
 from typing import Any, AsyncGenerator, Optional, Tuple
 
 import automerge
+import ffmpeg
 import numpy.typing as npt
 from pydantic import parse_raw_as
 from transcribee_proto.api import (
@@ -74,6 +75,19 @@ def get_last_atom_end(doc: EditorDocument):
     return 0
 
 
+def media_has_video(path: Path):
+    streams = ffmpeg.probe(path)["streams"]
+    for stream in streams:
+        if stream["codec_type"] == "video":
+            if stream["disposition"]["attached_pic"] != 0:
+                # ignore album covers
+                continue
+
+            return True
+
+    return False
+
+
 class Worker:
     base_url: str
     token: str
@@ -256,9 +270,17 @@ async def reencode(
         self.set_duration(task, duration)
 
         n_profiles = len(settings.REENCODE_PROFILES)
+
+        has_video = media_has_video(document_audio)
+
         for i, (profile, parameters) in enumerate(settings.REENCODE_PROFILES.items()):
             output_path = self._get_tmpfile(f"reencode_{profile.replace(':', '_')}")
 
+            video_profile = profile.startswith("video:")
+
+            if video_profile and not has_video:
+                continue
+
             await reencode(
                 document_audio,
                 output_path,
@@ -269,11 +291,14 @@ async def reencode(
                     **kwargs,
                 ),
                 duration,
-                include_video=(profile.startswith("video:")),
+                include_video=video_profile,
             )
 
             tags = [f"profile:{profile}"] + [f"{k}:{v}" for k, v in parameters.items()]
 
+            if video_profile:
+                tags.append("video")
+
             loop = asyncio.get_running_loop()
             await loop.run_in_executor(
                 None, self.add_document_media_file, task, output_path, tags

From a4f90118b74913f8f855f58a43fabd0d62f5bf88 Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Fri, 8 Dec 2023 16:18:43 +0100
Subject: [PATCH 4/6] =?UTF-8?q?=F0=9F=A7=B9=20Enable=20video=20preview=20b?=
 =?UTF-8?q?ased=20on=20media=20tags?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frontend/src/editor/player.tsx  |  9 ++++++---
 frontend/src/utils/use_audio.ts | 27 +++++++++++++++------------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
index 7cd1555f..5e1a43de 100644
--- a/frontend/src/editor/player.tsx
+++ b/frontend/src/editor/player.tsx
@@ -34,7 +34,7 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
 
   const [playbackRate, setPlaybackRate] = useLocalStorage('playbackRate', 1);
 
-  const sources = useMemo(() => {
+  const { sources, hasVideo } = useMemo(() => {
     // do not play the original file, it may be large
     const relevantMediaFiles =
       data?.media_files.filter((media) => !media.tags.includes('original')) || [];
@@ -49,13 +49,16 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
       };
     });
 
-    return sortMediaFiles(mappedFiles);
+    return {
+      sources: sortMediaFiles(mappedFiles),
+      hasVideo: videoFiles.length > 0,
+    };
   }, [data?.media_files]);
 
   const audio = useAudio({
     playbackRate,
     sources,
-    videoPreview: true,
+    videoPreview: hasVideo,
   });
 
   // calculate the start of the current element to color it
diff --git a/frontend/src/utils/use_audio.ts b/frontend/src/utils/use_audio.ts
index 7fe1ca9c..67410447 100644
--- a/frontend/src/utils/use_audio.ts
+++ b/frontend/src/utils/use_audio.ts
@@ -18,21 +18,24 @@ export function useAudio({ sources, playbackRate, videoPreview }: UseAudioOption
 
   useEffect(() => {
     const myAudioElement = videoPreview ? video([]) : audio([]);
-
     setAudioElement(myAudioElement);
 
+    if (videoPreview) {
+      myAudioElement.style = `
+        position: fixed;
+        bottom: 90px;
+        right: 20px;
+        height: 170px;
+        width: 300px;
+      `;
+    } else {
+      myAudioElement.style = `
+        display: none;
+      `;
+    }
+
     const e = events(myAudioElement);
     e.onDurationChange(() => {
-      if (videoPreview && myAudioElement.videoHeight > 0) {
-        myAudioElement.style = `
-          position: fixed;
-          bottom: 90px;
-          right: 20px;
-          height: 170px;
-          width: 300px;
-        `;
-      }
-
       setDuration(props(myAudioElement).duration);
     });
     e.onPlay(() => setPlayingState(true));
@@ -52,7 +55,7 @@ export function useAudio({ sources, playbackRate, videoPreview }: UseAudioOption
       myAudioElement.innerHTML = '';
       myAudioElement.remove();
     };
-  }, []);
+  }, [videoPreview]);
 
   useEffect(() => {
     if (!audioElement) return;

From b601f3edf47a2e6220f3c8d5a2a69efda0feb38e Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Fri, 8 Dec 2023 16:39:21 +0100
Subject: [PATCH 5/6] =?UTF-8?q?=E2=9C=A8=20Add=20space=20for=20video=20bel?=
 =?UTF-8?q?ow=20document=20content?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 frontend/src/editor/player.tsx               | 16 +++++++++++++++-
 frontend/src/editor/transcription_editor.tsx |  4 +++-
 frontend/src/pages/document.tsx              |  5 +++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
index 5e1a43de..1d7a6097 100644
--- a/frontend/src/editor/player.tsx
+++ b/frontend/src/editor/player.tsx
@@ -23,7 +23,15 @@ const SKIP_SHORTCUT_SEC = 3;
 
 let lastTabPressTs = 0;
 
-export function PlayerBar({ documentId, editor }: { documentId: string; editor: Editor }) {
+export function PlayerBar({
+  documentId,
+  editor,
+  onShowVideo,
+}: {
+  documentId: string;
+  editor: Editor;
+  onShowVideo?: (show: boolean) => void;
+}) {
   const { data } = useGetDocument(
     { document_id: documentId },
     {
@@ -61,6 +69,12 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
     videoPreview: hasVideo,
   });
 
+  useEffect(() => {
+    if (onShowVideo) {
+      onShowVideo(hasVideo);
+    }
+  }, [hasVideo]);
+
   // calculate the start of the current element to color it
   const [currentElementStartTime, setCurrentElementStartTime] = useState(0.0);
 
diff --git a/frontend/src/editor/transcription_editor.tsx b/frontend/src/editor/transcription_editor.tsx
index 46688bb6..c5becb43 100644
--- a/frontend/src/editor/transcription_editor.tsx
+++ b/frontend/src/editor/transcription_editor.tsx
@@ -224,12 +224,14 @@ export function TranscriptionEditor({
   documentId,
   readOnly,
   initialValue,
+  onShowVideo,
   ...props
 }: {
   editor?: Editor;
   documentId: string;
   readOnly: boolean;
   initialValue?: Paragraph[];
+  onShowVideo?: (show: boolean) => void;
 } & ComponentProps<'div'>) {
   const systemPrefersDark = useMediaQuery('(prefers-color-scheme: dark)');
   // prevent ctrl+s
@@ -312,7 +314,7 @@ export function TranscriptionEditor({
                   className={clsx('2xl:-ml-20')}
                 />
               </ErrorBoundary>
-              <PlayerBar documentId={documentId} editor={editor} />
+              <PlayerBar documentId={documentId} editor={editor} onShowVideo={onShowVideo} />
             </LoadingContext.Provider>
           </SpeakerColorsProvider>
         </Slate>
diff --git a/frontend/src/pages/document.tsx b/frontend/src/pages/document.tsx
index 9b6da10a..1ed18558 100644
--- a/frontend/src/pages/document.tsx
+++ b/frontend/src/pages/document.tsx
@@ -112,6 +112,7 @@ export function DocumentPage({
   const [_location, navigate] = useLocation();
   const debugMode = useDebugMode();
   const { isLoggedIn } = useAuthData();
+  const [videoVisible, setVideoVisible] = useState(false);
 
   const url = getDocumentWsUrl(documentId);
 
@@ -188,6 +189,7 @@ export function DocumentPage({
       </TopBar>
 
       <TranscriptionEditor
+        onShowVideo={setVideoVisible}
         editor={editor}
         documentId={documentId}
         initialValue={initialValue}
@@ -195,6 +197,9 @@ export function DocumentPage({
         readOnly={!data || !data.can_write}
       />
 
+      {/* Spacer to prevent video preview from hiding text */}
+      {videoVisible && <div className="h-36"></div>}
+
       {editor && debugMode && <Suspense>{<LazyDebugPanel editor={editor} />}</Suspense>}
     </AppContainer>
   );

From 1e3a74e6ed67740668597a480ee7e845475f8592 Mon Sep 17 00:00:00 2001
From: Philipp Mandler <info@philipp-mandler.com>
Date: Fri, 8 Dec 2023 17:41:12 +0100
Subject: [PATCH 6/6] =?UTF-8?q?=F0=9F=90=9B=20Fix=20reencode=20progress=20?=
 =?UTF-8?q?calculation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 worker/transcribee_worker/worker.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/worker/transcribee_worker/worker.py b/worker/transcribee_worker/worker.py
index f77c6789..1ecefa67 100644
--- a/worker/transcribee_worker/worker.py
+++ b/worker/transcribee_worker/worker.py
@@ -88,6 +88,10 @@ def media_has_video(path: Path):
     return False
 
 
+def is_video_profile(profile_name: str):
+    return profile_name.startswith("video:")
+
+
 class Worker:
     base_url: str
     token: str
@@ -269,17 +273,17 @@ async def reencode(
         duration = get_duration(document_audio)
         self.set_duration(task, duration)
 
-        n_profiles = len(settings.REENCODE_PROFILES)
-
         has_video = media_has_video(document_audio)
-
-        for i, (profile, parameters) in enumerate(settings.REENCODE_PROFILES.items()):
+        applicable_profiles = {
+            profile_name: parameters
+            for profile_name, parameters in settings.REENCODE_PROFILES.items()
+            if has_video or not is_video_profile(profile_name)
+        }
+        n_profiles = len(applicable_profiles)
+
+        for i, (profile, parameters) in enumerate(applicable_profiles.items()):
             output_path = self._get_tmpfile(f"reencode_{profile.replace(':', '_')}")
-
-            video_profile = profile.startswith("video:")
-
-            if video_profile and not has_video:
-                continue
+            video_profile = is_video_profile(profile)
 
             await reencode(
                 document_audio,