diff --git a/frontend/src/editor/player.tsx b/frontend/src/editor/player.tsx index 88a46ca2..1d7a6097 100644 --- a/frontend/src/editor/player.tsx +++ b/frontend/src/editor/player.tsx @@ -23,7 +23,15 @@ const SKIP_SHORTCUT_SEC = 3; let lastTabPressTs = 0; -export function PlayerBar({ documentId, editor }: { documentId: string; editor: Editor }) { +export function PlayerBar({ + documentId, + editor, + onShowVideo, +}: { + documentId: string; + editor: Editor; + onShowVideo?: (show: boolean) => void; +}) { const { data } = useGetDocument( { document_id: documentId }, { @@ -34,23 +42,39 @@ const [playbackRate, setPlaybackRate] = useLocalStorage('playbackRate', 1); - const sources = useMemo(() => { - const mappedFiles = - data?.media_files.map((media) => { - return { - src: media.url, - type: media.content_type, - }; - }) || []; + const { sources, hasVideo } = useMemo(() => { + // do not play the original file, it may be large + const relevantMediaFiles = + data?.media_files.filter((media) => !media.tags.includes('original')) || []; + + const videoFiles = relevantMediaFiles.filter((media) => media.tags.includes('video')); + const audioFiles = relevantMediaFiles.filter((media) => !media.tags.includes('video')); - return sortMediaFiles(mappedFiles); + const mappedFiles = [...videoFiles, ...audioFiles].map((media) => { + return { + src: media.url, + type: media.content_type, + }; + }); + + return { + sources: sortMediaFiles(mappedFiles), + hasVideo: videoFiles.length > 0, + }; }, [data?.media_files]); const audio = useAudio({ playbackRate, sources, + videoPreview: hasVideo, }); + useEffect(() => { + if (onShowVideo) { + onShowVideo(hasVideo); + } + }, [hasVideo, onShowVideo]); + // calculate the start of the current element to color it const [currentElementStartTime, setCurrentElementStartTime] = useState(0.0); diff --git a/frontend/src/editor/transcription_editor.tsx 
b/frontend/src/editor/transcription_editor.tsx index 46688bb6..c5becb43 100644 --- a/frontend/src/editor/transcription_editor.tsx +++ b/frontend/src/editor/transcription_editor.tsx @@ -224,12 +224,14 @@ export function TranscriptionEditor({ documentId, readOnly, initialValue, + onShowVideo, ...props }: { editor?: Editor; documentId: string; readOnly: boolean; initialValue?: Paragraph[]; + onShowVideo?: (show: boolean) => void; } & ComponentProps<'div'>) { const systemPrefersDark = useMediaQuery('(prefers-color-scheme: dark)'); // prevent ctrl+s @@ -312,7 +314,7 @@ export function TranscriptionEditor({ className={clsx('2xl:-ml-20')} /> - + diff --git a/frontend/src/pages/document.tsx b/frontend/src/pages/document.tsx index 9b6da10a..1ed18558 100644 --- a/frontend/src/pages/document.tsx +++ b/frontend/src/pages/document.tsx @@ -112,6 +112,7 @@ export function DocumentPage({ const [_location, navigate] = useLocation(); const debugMode = useDebugMode(); const { isLoggedIn } = useAuthData(); + const [videoVisible, setVideoVisible] = useState(false); const url = getDocumentWsUrl(documentId); @@ -188,6 +189,7 @@ export function DocumentPage({ + {/* Spacer to prevent video preview from hiding text */} + {videoVisible &&
} + {editor && debugMode && {}} ); diff --git a/frontend/src/utils/use_audio.ts b/frontend/src/utils/use_audio.ts index 6e934b29..67410447 100644 --- a/frontend/src/utils/use_audio.ts +++ b/frontend/src/utils/use_audio.ts @@ -1,12 +1,13 @@ -import { actions, audio, events, props } from '@podlove/html5-audio-driver'; +import { actions, video, events, props, audio } from '@podlove/html5-audio-driver'; import { useCallback, useEffect, useRef, useState } from 'react'; type UseAudioOptions = { playbackRate?: number; sources: Array<{ src: string; type: string }>; + videoPreview?: boolean; }; -export function useAudio({ sources, playbackRate }: UseAudioOptions) { +export function useAudio({ sources, playbackRate, videoPreview }: UseAudioOptions) { const [playing, setPlayingState] = useState(false); const [duration, setDuration] = useState(); const [buffering, setBuffering] = useState(false); @@ -16,9 +17,23 @@ export function useAudio({ sources, playbackRate }: UseAudioOptions) { const [audioElement, setAudioElement] = useState(null); useEffect(() => { - const myAudioElement = audio([]); + const myAudioElement = videoPreview ? 
video([]) : audio([]); setAudioElement(myAudioElement); + if (videoPreview) { + myAudioElement.style = ` + position: fixed; + bottom: 90px; + right: 20px; + height: 170px; + width: 300px; + `; + } else { + myAudioElement.style = ` + display: none; + `; + } + const e = events(myAudioElement); e.onDurationChange(() => { setDuration(props(myAudioElement).duration); @@ -40,7 +55,7 @@ myAudioElement.innerHTML = ''; myAudioElement.remove(); }; - }, []); + }, [videoPreview]); useEffect(() => { if (!audioElement) return; diff --git a/worker/transcribee_worker/config.py b/worker/transcribee_worker/config.py index 2a020784..9dbaec9a 100644 --- a/worker/transcribee_worker/config.py +++ b/worker/transcribee_worker/config.py @@ -22,6 +22,18 @@ class Settings(BaseSettings): "audio_bitrate": "128k", "ac": "1", }, + "video:mp4": { + "format": "mp4", + "audio_bitrate": "128k", + "ac": "1", + "c:v": "libx264", + "crf": "26", + "preset": "faster", + # downscale to 480p and pad to multiple of 2 (needed for libx264) + "vf": "scale='min(854,iw)':'min(480,ih)'" + ":force_original_aspect_ratio=decrease," + "pad='iw+mod(iw\\,2)':'ih+mod(ih\\,2)'", + }, } KEEPALIVE_INTERVAL: float = 0.5 diff --git a/worker/transcribee_worker/reencode.py b/worker/transcribee_worker/reencode.py index 1a6632b3..bf29fcbe 100644 --- a/worker/transcribee_worker/reencode.py +++ b/worker/transcribee_worker/reencode.py @@ -16,21 +16,23 @@ async def reencode( output_params: dict[str, str], progress_callback: ProgressCallbackType, duration: float, + include_video: bool, ): def work(_): - cmd: subprocess.Popen = ( - ffmpeg.input(input_path) - .output( - filename=output_path, - map="0:a", - loglevel="quiet", - stats=None, - progress="-", - map_metadata="-1", - **output_params - ) - .run_async(pipe_stdout=True) - ) + pipeline = ffmpeg.input(input_path) + streams = [pipeline.audio] + if include_video: + streams.append(pipeline.video) + + cmd: 
subprocess.Popen = ffmpeg.output( + *streams, + filename=output_path, + loglevel="quiet", + stats=None, + progress="-", + map_metadata="-1", + **output_params + ).run_async(pipe_stdout=True) assert cmd.stdout raw_line: bytes progress_dict = {} diff --git a/worker/transcribee_worker/worker.py b/worker/transcribee_worker/worker.py index a8670fac..1ecefa67 100644 --- a/worker/transcribee_worker/worker.py +++ b/worker/transcribee_worker/worker.py @@ -9,6 +9,7 @@ from typing import Any, AsyncGenerator, Optional, Tuple import automerge +import ffmpeg import numpy.typing as npt from pydantic import parse_raw_as from transcribee_proto.api import ( @@ -74,6 +75,23 @@ def get_last_atom_end(doc: EditorDocument): return 0 +def media_has_video(path: Path): + streams = ffmpeg.probe(path)["streams"] + for stream in streams: + if stream["codec_type"] == "video": + if stream["disposition"]["attached_pic"] != 0: + # ignore album covers + continue + + return True + + return False + + +def is_video_profile(profile_name: str): + return profile_name.startswith("video:") + + class Worker: base_url: str token: str @@ -255,9 +273,17 @@ async def reencode( duration = get_duration(document_audio) self.set_duration(task, duration) - n_profiles = len(settings.REENCODE_PROFILES) - for i, (profile, parameters) in enumerate(settings.REENCODE_PROFILES.items()): - output_path = self._get_tmpfile(f"reencode_{profile}") + has_video = media_has_video(document_audio) + applicable_profiles = { + profile_name: parameters + for profile_name, parameters in settings.REENCODE_PROFILES.items() + if has_video or not is_video_profile(profile_name) + } + n_profiles = len(applicable_profiles) + + for i, (profile, parameters) in enumerate(applicable_profiles.items()): + output_path = self._get_tmpfile(f"reencode_{profile.replace(':', '_')}") + video_profile = is_video_profile(profile) await reencode( document_audio, @@ -269,10 +295,14 @@ async def reencode( **kwargs, ), duration, + include_video=video_profile, ) 
tags = [f"profile:{profile}"] + [f"{k}:{v}" for k, v in parameters.items()] + if video_profile: + tags.append("video") + loop = asyncio.get_running_loop() await loop.run_in_executor( None, self.add_document_media_file, task, output_path, tags