feat: integrate whole process #16

Merged (1 commit, Dec 5, 2024)
README.md (1 addition, 1 deletion)
@@ -376,7 +376,7 @@ python3 main.py

```bash
curl -X POST -H "Content-Type: application/json" \
-d '{"date": "2024-12-01"}' \
-d '{"date": "2024-12-03"}' \
http://localhost:8000/start
```

SmartLegiCrawler/app/routes.py (1 addition, 0 deletions)
@@ -29,6 +29,7 @@ async def get_meetings():
q = request.args.get('q', None)
committee = request.args.get('committee', None)
limit = request.args.get('limit', default=100, type=int)
print(f"start_date: {start_date}, end_date: {end_date}, page: {page}, q: {q}, committee: {committee}, limit: {limit}")

try:
meetings, has_more, current_page = await scrape_meetings(start_date, end_date, page, q, committee, limit)
SmartLegiCrawler/app/scraper.py (1 addition, 2 deletions)
@@ -62,8 +62,7 @@ def parse_video_element(video_element):

return video_info

async def scrape_video_links(meeting_id):
browser = await init_driver()
async def scrape_video_links(browser, meeting_id):
"""爬取會議視頻鏈接"""
base_url = 'https://ivod.ly.gov.tw'
meeting_url = f'http://ivod.ly.gov.tw/Demand/Meetvod?Meet={meeting_id}'
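Note: with this change `scrape_video_links` no longer creates its own browser; the caller initializes one (for example via `init_driver`, which the removed line used) and passes it in. A minimal caller sketch, assuming the module is importable as `app.scraper` inside the SmartLegiCrawler service and that no explicit driver cleanup is needed here:

```python
# Hypothetical caller: create the browser once and reuse it for several meetings.
import asyncio
from app.scraper import init_driver, scrape_video_links  # import path assumed

async def collect_video_links(meeting_ids):
    browser = await init_driver()  # now owned by the caller, not by scrape_video_links
    links = {}
    for meeting_id in meeting_ids:
        links[meeting_id] = await scrape_video_links(browser, meeting_id)
    return links

# asyncio.run(collect_video_links(["2024120354321"]))  # illustrative meeting id
```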
app/crawler.py (24 additions, 17 deletions)
@@ -1,30 +1,37 @@
# app/crawler.py
import requests
import aiohttp
from datetime import datetime
from app.utils import setup_logger, get_path

logger = setup_logger('crawler', 'logs/crawler.log')

BASE_URL = "http://localhost:5000" # SmartLegiCrawler API 地址

def fetch_meetings(date):
async def fetch_meetings(date):
"""根據日期爬取會議列表"""
if not date:
date = datetime.now().strftime("%Y-%m-%d")
response = requests.get(f"{BASE_URL}/api/meetings?start_date={date}&end_date={date}")
if response.status_code == 200:
meetings = response.json().get('meetings', [])
logger.info(f"Fetched {len(meetings)} meetings for date: {date}")
return meetings
else:
logger.error(f"Failed to fetch meetings for date: {date}")
return []
print(f"date: {date}")
async with aiohttp.ClientSession() as session:
print(f"{BASE_URL}/api/meetings?start_date={date}&end_date={date}")
async with session.get(f"{BASE_URL}/api/meetings?start_date={date}&end_date={date}") as response:
if response.status == 200:
meetings = await response.json()
logger.info(f"Fetched {len(meetings['meetings'])} meetings for date: {date}")
return meetings['meetings']
else:
logger.error(f"Failed to fetch meetings for date: {date}")
return []

def download_video(video_url, video_id):
"""下載視頻並保存到 shared_data/videos"""
async def download_video(video_url, video_id):
"""非同步下載視頻並保存到 shared_data/videos"""
video_path = get_path("shared_data/videos", f"{video_id}.mp4")
response = requests.post(f"{BASE_URL}/api/download", json={"url": video_url})
if response.status_code == 200:
logger.info(f"Downloaded video: {video_url} -> {video_path}")
else:
logger.error(f"Failed to download video: {video_url}")
async with aiohttp.ClientSession() as session:
async with session.post(f"{BASE_URL}/api/download", json={"url": video_url}) as response:
if response.status == 200:
video_content = await response.read()
with open(video_path, 'wb') as f:
f.write(video_content)
logger.info(f"Downloaded video: {video_url} -> {video_path}")
else:
logger.error(f"Failed to download video: {video_url}")
app/generator.py (15 additions, 13 deletions)
@@ -1,13 +1,13 @@
# app/generator.py
import os
import requests
import aiohttp
from app.utils import setup_logger, get_path

logger = setup_logger('generator', 'logs/generator.log')

BASE_URL = "http://localhost:5003" # Vocal & Video API 地址

def generate_vocal(video_id):
async def generate_vocal(video_id):
"""基於摘要生成配音"""
summary_path = get_path("shared_data/summaries", f"{video_id}.txt")
audio_path = get_path("shared_data/audios", f"{video_id}.wav")
@@ -16,13 +16,14 @@ def generate_vocal(video_id):
logger.error(f"Summary file not found: {summary_path}")
return

response = requests.post(f"{BASE_URL}/generate-vocal", json={"text": summary_path})
if response.status_code == 200:
logger.info(f"Generated vocal: {video_id} -> {audio_path}")
else:
logger.error(f"Failed to generate vocal: {video_id}")
async with aiohttp.ClientSession() as session:
async with session.post(f"{BASE_URL}/generate-vocal", json={"text": summary_path}) as response:
if response.status == 200:
logger.info(f"Generated vocal: {video_id} -> {audio_path}")
else:
logger.error(f"Failed to generate vocal: {video_id}")

def generate_video(video_id):
async def generate_video(video_id):
"""基於配音生成新聞視頻"""
audio_path = get_path("shared_data/audios", f"{video_id}.wav")
video_output_path = get_path("shared_data/videos", f"news_{video_id}.mp4")
@@ -31,8 +32,9 @@ def generate_video(video_id):
logger.error(f"Audio file not found: {audio_path}")
return

response = requests.post(f"{BASE_URL}/generate-video", json={"audio_path": audio_path})
if response.status_code == 200:
logger.info(f"Generated video: {video_id} -> {video_output_path}")
else:
logger.error(f"Failed to generate video: {video_id}")
async with aiohttp.ClientSession() as session:
async with session.post(f"{BASE_URL}/generate-video", json={"audio_path": audio_path}) as response:
if response.status == 200:
logger.info(f"Generated video: {video_id} -> {video_output_path}")
else:
logger.error(f"Failed to generate video: {video_id}")
app/progress_checker.py (110 additions, 37 deletions)
@@ -1,51 +1,124 @@
# app/progress_checker.py

import asyncio
from datetime import datetime, timedelta
from app.crawler import fetch_meetings, download_video
from app.transcriber import transcribe_video
from app.summarizer import summarize_transcript
from app.generator import generate_vocal, generate_video
from app.utils import load_progress, save_progress

def process_videos(date=None):
    """Full video-processing pipeline, starting from the given date or the last saved progress"""

async def process_videos(date=None):
    """Full video-processing pipeline; resumes from the last saved progress, or processes videos for the given date"""
    progress = load_progress()

    # Check whether a date was given; a new date takes priority
    if date:
        progress["date"] = date
        save_progress("date", {"date": date})

    date_to_process = progress.get("date")
    if not date_to_process:
        raise ValueError("No video date specified and no previous progress record.")
    # **Step 1: resolve the date**
    if "date" not in progress:
        date_to_process = date or (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
        progress["date"] = date_to_process
        save_progress("date", date_to_process)
    else:
        date_to_process = progress["date"]
    print(f"Processing date: {date_to_process}")

    # Fetch the meeting list
    meetings = fetch_meetings(date_to_process)
    save_progress("meetings", {"meetings": meetings})
    # **Step 2: fetch the meeting list**
    if "meetings" not in progress or not progress["meetings"]:
        print("Fetching the meeting list...")
        meetings = await fetch_meetings(date_to_process)
        progress["meetings"] = meetings
        save_progress("meetings", meetings)
    else:
        meetings = progress["meetings"]

    # Download videos
    # **Step 3: process each meeting's videos**
for meeting in meetings:
for video in meeting.get("videos", []):
download_video(video["url"])
save_progress("download", {"video_id": video["video_id"]})
meeting_id = meeting.get("meeting_id")
videos = meeting.get("video_links", {}).get("video_links", [])

    # Transcribe subtitles
    for meeting in meetings:
        for video in meeting.get("videos", []):
            transcribe_video(video["url"])
            save_progress("transcribe", {"video_id": video["video_id"]})
        # Filter and sort the `clip`-type videos
        clip_videos = sorted(
            [video for video in videos if video.get("type") == "clip"],
            key=lambda x: int(x["video_links"]["窄頻"].split("/")[-1])
        )

    # Generate summaries
    for meeting in meetings:
        for video in meeting.get("videos", []):
            summarize_transcript(video["url"])
            save_progress("summarize", {"video_id": video["video_id"]})
        # Initialize progress for this meeting
        if "results" not in progress:
            progress["results"] = {}
        if meeting_id not in progress["results"]:
            progress["results"][meeting_id] = {}

    # Voice-over and video generation
    for meeting in meetings:
        for video in meeting.get("videos", []):
            transcript = f"transcripts/{video['video_id']}.json"
            summary = f"summaries/{video['video_id']}.txt"
            generate_vocal(summary)
            generate_video(summary)
            save_progress("generate", {"video_id": video["video_id"]})
        for video in clip_videos:
            video_id = video["video_links"]["窄頻"].split("/")[-1]
            video_url = video["video_links"]["窄頻"]

            # Initialize progress for this video
if video_id not in progress["results"][meeting_id]:
progress["results"][meeting_id][video_id] = {
"status": "pending",
"steps": {
"download": False,
"transcribe": False,
"summarize": False,
"generate_audio": False,
"generate_video": False
},
"audio": None,
"video": None,
"summary": None,
"error": None
}

video_progress = progress["results"][meeting_id][video_id]

            try:
                # **Download the video**
                if not video_progress["steps"]["download"]:
                    print(f"Downloading video: {video_url}")
                    try:
                        await download_video(video_url, video_id)
                        video_progress["steps"]["download"] = True
                        save_progress("results", progress["results"])
                    except Exception as e:
                        video_progress["error"] = f"Download failed: {str(e)}"
                        save_progress("results", progress["results"])
                        print(f"Error while downloading video {video_id}: {e}")

                # **Transcribe subtitles**
                if not video_progress["steps"]["transcribe"] and video_progress["steps"]["download"]:
                    print(f"Transcribing video: {video_id}")
                    await transcribe_video(video_id)
                    video_progress["steps"]["transcribe"] = True
                    save_progress("results", progress["results"])

                # **Generate the summary**
                if not video_progress["steps"]["summarize"] and video_progress["steps"]["transcribe"]:
                    print(f"Summarizing video: {video_id}")
                    summary_path = await summarize_transcript(video_id)
                    video_progress["steps"]["summarize"] = True
                    video_progress["summary"] = summary_path
                    save_progress("results", progress["results"])

                # **Generate the voice-over**
                if not video_progress["steps"]["generate_audio"] and video_progress["steps"]["summarize"]:
                    print(f"Generating voice-over: {video_id}")
                    audio_path = await generate_vocal(video_id)
                    video_progress["steps"]["generate_audio"] = True
                    video_progress["audio"] = audio_path
                    save_progress("results", progress["results"])

                # **Generate the video**
                if not video_progress["steps"]["generate_video"] and video_progress["steps"]["summarize"]:
                    print(f"Generating news video: {video_id}")
                    video_path = await generate_video(video_id)
                    video_progress["steps"]["generate_video"] = True
                    video_progress["video"] = video_path
                    video_progress["status"] = "completed"
                    save_progress("results", progress["results"])

                # TODO: stitch all the videos of the same meeting into one

            except Exception as e:
                # Log the error and skip
                video_progress["error"] = str(e)
                save_progress("results", progress["results"])
                print(f"Error while processing video {video_id}: {e}")
app/summarizer.py (11 additions, 7 deletions)
@@ -1,13 +1,13 @@
# app/summarizer.py
import os
import requests
import aiohttp
from app.utils import setup_logger, get_path

logger = setup_logger('summarizer', 'logs/summarizer.log')

BASE_URL = "http://localhost:5002" # EventSummarizer API 地址

def summarize_transcript(video_id):
async def summarize_transcript(video_id):
"""生成摘要,並保存到 shared_data/summaries"""
transcript_path = get_path("shared_data/transcripts", f"{video_id}.json")
summary_path = get_path("shared_data/summaries", f"{video_id}.txt")
@@ -16,8 +16,12 @@ def summarize_transcript(video_id):
logger.error(f"Transcript file not found: {transcript_path}")
return

response = requests.get(f"{BASE_URL}/api/summarize", json={"url": transcript_path})
if response.status_code == 200:
logger.info(f"Summarized transcript: {video_id} -> {summary_path}")
else:
logger.error(f"Failed to summarize transcript: {video_id}")
async with aiohttp.ClientSession() as session:
async with session.get(f"{BASE_URL}/api/summarize", json={"url": transcript_path}) as response:
if response.status == 200:
summary = await response.text()
with open(summary_path, 'w') as f:
f.write(summary)
logger.info(f"Generated summary for video: {video_id}")
else:
logger.error(f"Failed to generate summary for video: {video_id}")
app/transcriber.py (12 additions, 7 deletions)
@@ -1,13 +1,14 @@
# app/transcriber.py
import os
import requests
import aiohttp
from app.utils import setup_logger, get_path
import json

logger = setup_logger('transcriber', 'logs/transcriber.log')

BASE_URL = "http://localhost:5001" # VideoScript API 地址

def transcribe_video(video_id):
async def transcribe_video(video_id):
"""將視頻轉換為字幕,並保存到 shared_data/transcripts"""
video_path = get_path("shared_data/videos", f"{video_id}.mp4")
transcript_path = get_path("shared_data/transcripts", f"{video_id}.json")
@@ -16,8 +17,12 @@ def transcribe_video(video_id):
logger.error(f"Video file not found: {video_path}")
return

response = requests.post(f"{BASE_URL}/api/transcribe", json={"url": video_path})
if response.status_code == 200:
logger.info(f"Transcribed video: {video_id} -> {transcript_path}")
else:
logger.error(f"Failed to transcribe video: {video_id}")
async with aiohttp.ClientSession() as session:
async with session.post(f"{BASE_URL}/api/transcribe", json={"url": video_path}) as response:
if response.status == 200:
transcript = await response.json()
with open(transcript_path, 'w') as f:
json.dump(transcript, f)
logger.info(f"Transcribed video: {video_id} -> {transcript_path}")
else:
logger.error(f"Failed to transcribe video: {video_id}")