bugbakery · phlmn · Dec 10, 2023 · Dec 6, 2023 · Dec 7, 2023 · Dec 8, 2023
diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx
@@ -23,7 +23,15 @@ const SKIP_SHORTCUT_SEC = 3;
 
 let lastTabPressTs = 0;
 
-export function PlayerBar({ documentId, editor }: { documentId: string; editor: Editor }) {
+export function PlayerBar({
+  documentId,
+  editor,
+  onShowVideo,
+}: {
+  documentId: string;
+  editor: Editor;
+  onShowVideo?: (show: boolean) => void;
+}) {
   const { data } = useGetDocument(
     { document_id: documentId },
     {
@@ -34,23 +42,39 @@ export function PlayerBar({ documentId, editor }: { documentId: string; editor:
 
   const [playbackRate, setPlaybackRate] = useLocalStorage('playbackRate', 1);
 
-  const sources = useMemo(() => {
-    const mappedFiles =
-      data?.media_files.map((media) => {
-        return {
-          src: media.url,
-          type: media.content_type,
-        };
-      }) || [];
+  const { sources, hasVideo } = useMemo(() => {
+    // do not play the original file, it may be large
+    const relevantMediaFiles =
+      data?.media_files.filter((media) => !media.tags.includes('original')) || [];
+
+    const videoFiles = relevantMediaFiles.filter((media) => media.tags.includes('video'));
+    const audioFiles = relevantMediaFiles.filter((media) => !media.tags.includes('video'));
 
-    return sortMediaFiles(mappedFiles);
+    const mappedFiles = [...videoFiles, ...audioFiles].map((media) => {
+      return {
+        src: media.url,
+        type: media.content_type,
+      };
+    });
+
+    return {
+      sources: sortMediaFiles(mappedFiles),
+      hasVideo: videoFiles.length > 0,
+    };
   }, [data?.media_files]);
 
   const audio = useAudio({
     playbackRate,
     sources,
+    videoPreview: hasVideo,
   });
 
+  useEffect(() => {
+    if (onShowVideo) {
+      onShowVideo(hasVideo);
+    }
+  }, [hasVideo]);
+
   // calculate the start of the current element to color it
   const [currentElementStartTime, setCurrentElementStartTime] = useState(0.0);
 

diff --git a/frontend/src/editor/transcription_editor.tsx b/frontend/src/editor/transcription_editor.tsx
@@ -224,12 +224,14 @@ export function TranscriptionEditor({
   documentId,
   readOnly,
   initialValue,
+  onShowVideo,
   ...props
 }: {
   editor?: Editor;
   documentId: string;
   readOnly: boolean;
   initialValue?: Paragraph[];
+  onShowVideo?: (show: boolean) => void;
 } & ComponentProps<'div'>) {
   const systemPrefersDark = useMediaQuery('(prefers-color-scheme: dark)');
   // prevent ctrl+s
@@ -312,7 +314,7 @@ export function TranscriptionEditor({
                   className={clsx('2xl:-ml-20')}
                 />
               </ErrorBoundary>
-              <PlayerBar documentId={documentId} editor={editor} />
+              <PlayerBar documentId={documentId} editor={editor} onShowVideo={onShowVideo} />
             </LoadingContext.Provider>
           </SpeakerColorsProvider>
         </Slate>

diff --git a/frontend/src/pages/document.tsx b/frontend/src/pages/document.tsx
@@ -112,6 +112,7 @@ export function DocumentPage({
   const [_location, navigate] = useLocation();
   const debugMode = useDebugMode();
   const { isLoggedIn } = useAuthData();
+  const [videoVisible, setVideoVisible] = useState(false);
 
   const url = getDocumentWsUrl(documentId);
 
@@ -188,13 +189,17 @@ export function DocumentPage({
       </TopBar>
 
       <TranscriptionEditor
+        onShowVideo={setVideoVisible}
         editor={editor}
         documentId={documentId}
         initialValue={initialValue}
         className={'grow flex flex-col'}
         readOnly={!data || !data.can_write}
       />
 
+      {/* Spacer to prevent video preview from hiding text */}
+      {videoVisible && <div className="h-36"></div>}
+
       {editor && debugMode && <Suspense>{<LazyDebugPanel editor={editor} />}</Suspense>}
     </AppContainer>
   );

diff --git a/frontend/src/utils/use_audio.ts b/frontend/src/utils/use_audio.ts
@@ -1,12 +1,13 @@
-import { actions, audio, events, props } from '@podlove/html5-audio-driver';
+import { actions, video, events, props, audio } from '@podlove/html5-audio-driver';
 import { useCallback, useEffect, useRef, useState } from 'react';
 
 type UseAudioOptions = {
   playbackRate?: number;
   sources: Array<{ src: string; type: string }>;
+  videoPreview?: boolean;
 };
 
-export function useAudio({ sources, playbackRate }: UseAudioOptions) {
+export function useAudio({ sources, playbackRate, videoPreview }: UseAudioOptions) {
   const [playing, setPlayingState] = useState(false);
   const [duration, setDuration] = useState<number | undefined>();
   const [buffering, setBuffering] = useState(false);
@@ -16,9 +17,23 @@ export function useAudio({ sources, playbackRate }: UseAudioOptions) {
   const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
 
   useEffect(() => {
-    const myAudioElement = audio([]);
+    const myAudioElement = videoPreview ? video([]) : audio([]);
     setAudioElement(myAudioElement);
 
+    if (videoPreview) {
+      myAudioElement.style = `
+        position: fixed;
+        bottom: 90px;
+        right: 20px;
+        height: 170px;
+        width: 300px;
+      `;
+    } else {
+      myAudioElement.style = `
+        display: none;
+      `;
+    }
+
     const e = events(myAudioElement);
     e.onDurationChange(() => {
       setDuration(props(myAudioElement).duration);
@@ -40,7 +55,7 @@ export function useAudio({ sources, playbackRate }: UseAudioOptions) {
       myAudioElement.innerHTML = '';
       myAudioElement.remove();
     };
-  }, []);
+  }, [videoPreview]);
 
   useEffect(() => {
     if (!audioElement) return;

diff --git a/worker/transcribee_worker/config.py b/worker/transcribee_worker/config.py
@@ -22,6 +22,18 @@ class Settings(BaseSettings):
             "audio_bitrate": "128k",
             "ac": "1",
         },
+        "video:mp4": {
+            "format": "mp4",
+            "audio_bitrate": "128k",
+            "ac": "1",
+            "c:v": "libx264",
+            "crf": "26",
+            "preset": "faster",
+            # downscale to 480p and pad to multiple of 2 (needed for libx264)
+            "vf": "scale='min(854,iw)':'min(480,ih)'"
+            ":force_original_aspect_ratio=decrease,"
+            "pad='iw+mod(iw\\,2)':'ih+mod(ih\\,2)",
+        },
     }
 
     KEEPALIVE_INTERVAL: float = 0.5

diff --git a/worker/transcribee_worker/reencode.py b/worker/transcribee_worker/reencode.py
@@ -16,21 +16,23 @@ async def reencode(
     output_params: dict[str, str],
     progress_callback: ProgressCallbackType,
     duration: float,
+    include_video: bool,
 ):
     def work(_):
-        cmd: subprocess.Popen = (
-            ffmpeg.input(input_path)
-            .output(
-                filename=output_path,
-                map="0:a",
-                loglevel="quiet",
-                stats=None,
-                progress="-",
-                map_metadata="-1",
-                **output_params
-            )
-            .run_async(pipe_stdout=True)
-        )
+        pipeline = ffmpeg.input(input_path)
+        streams = [pipeline.audio]
+        if include_video:
+            streams.append(pipeline.video)
+
+        cmd: subprocess.Popen = ffmpeg.output(
+            *streams,
+            filename=output_path,
+            loglevel="quiet",
+            stats=None,
+            progress="-",
+            map_metadata="-1",
+            **output_params
+        ).run_async(pipe_stdout=True)
         assert cmd.stdout
         raw_line: bytes
         progress_dict = {}

diff --git a/worker/transcribee_worker/worker.py b/worker/transcribee_worker/worker.py
@@ -9,6 +9,7 @@
 from typing import Any, AsyncGenerator, Optional, Tuple
 
 import automerge
+import ffmpeg
 import numpy.typing as npt
 from pydantic import parse_raw_as
 from transcribee_proto.api import (
@@ -74,6 +75,23 @@ def get_last_atom_end(doc: EditorDocument):
     return 0
 
 
+def media_has_video(path: Path):
+    streams = ffmpeg.probe(path)["streams"]
+    for stream in streams:
+        if stream["codec_type"] == "video":
+            if stream["disposition"]["attached_pic"] != 0:
+                # ignore album covers
+                continue
+
+            return True
+
+    return False
+
+
+def is_video_profile(profile_name: str) -> bool:  # True for "video:"-prefixed names
+    return profile_name.startswith("video:")  # e.g. the "video:mp4" profile key
+
+
 class Worker:
     base_url: str
     token: str
@@ -255,9 +273,17 @@ async def reencode(
         duration = get_duration(document_audio)
         self.set_duration(task, duration)
 
-        n_profiles = len(settings.REENCODE_PROFILES)
-        for i, (profile, parameters) in enumerate(settings.REENCODE_PROFILES.items()):
-            output_path = self._get_tmpfile(f"reencode_{profile}")
+        has_video = media_has_video(document_audio)
+        applicable_profiles = {
+            profile_name: parameters
+            for profile_name, parameters in settings.REENCODE_PROFILES.items()
+            if has_video or not is_video_profile(profile_name)
+        }
+        n_profiles = len(applicable_profiles)
+
+        for i, (profile, parameters) in enumerate(applicable_profiles.items()):
+            output_path = self._get_tmpfile(f"reencode_{profile.replace(':', '_')}")
+            video_profile = is_video_profile(profile)
 
             await reencode(
                 document_audio,
@@ -269,10 +295,14 @@ async def reencode(
                     **kwargs,
                 ),
                 duration,
+                include_video=video_profile,
             )
 
             tags = [f"profile:{profile}"] + [f"{k}:{v}" for k, v in parameters.items()]
 
+            if video_profile:
+                tags.append("video")
+
             loop = asyncio.get_running_loop()
             await loop.run_in_executor(
                 None, self.add_document_media_file, task, output_path, tags