-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
87 lines (71 loc) · 2.77 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import yaml
import argparse
import os
import json
from tqdm import tqdm
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pytube import YouTube
from pytube.cli import on_progress
from youtube_transcript_api import YouTubeTranscriptApi
def _str_to_bool(value):
    """Convert a command-line string such as "true"/"false" to a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    "False") as ``True``; this converter interprets the text instead.
    An empty string maps to ``False``.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1", "on"):
        return True
    if text in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"Expected a boolean value, got {value!r}.")


def get_args():
    """Parse the command-line options of the scraper.

    Returns:
        argparse.Namespace with the attributes ``query``,
        ``credentials_file``, ``dest_folder``, ``max_results`` and
        ``download_captions``.
    """
    parser = argparse.ArgumentParser("Youtube Video Scraper")
    parser.add_argument("--query", type=str,
                        default="", help="Search query.")
    parser.add_argument("--credentials_file", type=str,
                        default="credentials.yaml",
                        help="Path to the credentials file.")
    parser.add_argument("--dest_folder", type=str,
                        default="videos/", help="Path to the output folder.")
    parser.add_argument("--max_results", type=int,
                        default=2, help="Maximum number of results.")
    # A real converter is used instead of ``type=bool`` so that
    # "--download_captions False" actually yields False.
    parser.add_argument("--download_captions", type=_str_to_bool,
                        default=True,
                        help="Download captions, if any, for the videos.")
    return parser.parse_args()
def get_creds(args):
    """Read the API key from the YAML credentials file.

    Args:
        args: object whose ``credentials_file`` attribute is the path of a
            YAML file containing an ``api_key`` entry.

    Returns:
        The value stored under ``api_key``.
    """
    with open(args.credentials_file, "r") as handle:
        # The key is extracted while the file is open; the handle is closed
        # on return by the context manager.
        return yaml.safe_load(handle)["api_key"]
def get_video_listings(api_key, args):
    """Search YouTube and return the ids of the matching videos.

    Args:
        api_key: developer key passed to the YouTube Data API client.
        args: object providing ``query`` (search text) and ``max_results``
            (value sent as ``maxResults``).

    Returns:
        A list with the ``videoId`` of every returned search item, or an
        empty list when the API call fails with an ``HttpError``.
    """
    try:
        search_term, limit = args.query, args.max_results
        client = build('youtube', 'v3', developerKey=api_key)
        request = client.search().list(
            part='snippet',
            type='video',
            q=search_term,
            maxResults=limit,
        )
        response = request.execute()
        hits = response["items"]
        # Echo the raw search hits so the user can see what was matched.
        print(yaml.dump(hits))
        return [hit["id"]["videoId"] for hit in hits]
    except HttpError as e:
        print(f'An HTTP error {e.resp.status} occurred: {e.content}')
    return []
def form_youtube_url(video_id):
    """Return the short ``youtu.be`` link for the given video id string."""
    prefix = "http://youtu.be/"
    url = prefix + video_id
    return url
def download_video(video_id, args):
    """Download a video as ``<video_id>.mp4`` into ``args.dest_folder``.

    The stream chosen is the first progressive MP4 stream when the
    available streams are ordered by resolution, descending.  If the video
    offers no such stream, a message is printed and nothing is downloaded
    instead of failing on a ``None`` stream.

    Args:
        video_id: YouTube video id.
        args: object providing ``dest_folder``, the output directory.
    """
    dest_folder = args.dest_folder
    video_url = form_youtube_url(video_id)
    video = YouTube(video_url, on_progress_callback=on_progress)
    stream = (
        video.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
        # .first() yields None when the filter matches nothing.
        print(f'No progressive mp4 stream found for video {video_id}; '
              'skipping download.')
        return
    if dest_folder:
        # exist_ok avoids the check-then-create race; an empty path is
        # skipped because os.makedirs('') raises.
        os.makedirs(dest_folder, exist_ok=True)
    stream.download(dest_folder, filename=f'{video_id}.mp4')
def download_captions(video_id, args):
    """Save the transcript of a video as ``<video_id>.json``.

    The transcript returned by ``YouTubeTranscriptApi.get_transcript`` is
    written as JSON into ``args.dest_folder``, which is created when
    missing.  If the transcript cannot be retrieved (for example the video
    has no captions), a message is printed and no file is written, so one
    caption-less video does not abort a whole batch.

    Args:
        video_id: YouTube video id.
        args: object providing ``dest_folder``, the output directory.
    """
    # Imported locally so this function does not depend on a change to the
    # module-level imports.  NOTE(review): CouldNotRetrieveTranscript is
    # expected to be the package's base error for failed retrievals --
    # confirm against the installed youtube_transcript_api version.
    from youtube_transcript_api import CouldNotRetrieveTranscript

    dest_folder = args.dest_folder
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except CouldNotRetrieveTranscript as e:
        print(f'No captions downloaded for video {video_id} '
              f'({type(e).__name__}).')
        return
    if dest_folder:
        # An empty path is skipped because os.makedirs('') raises.
        os.makedirs(dest_folder, exist_ok=True)
    # os.path.join avoids the doubled separator produced by string
    # formatting when dest_folder already ends with '/'.
    target = os.path.join(dest_folder, f'{video_id}.json')
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(transcript, fp)
if __name__ == '__main__':
    # Script entry point: parse options, load the API key, search for
    # videos, then download each result with a progress bar over the ids.
    args = get_args()
    api_key = get_creds(args)
    video_ids = get_video_listings(api_key, args)
    for video_id in tqdm(video_ids):
        download_video(video_id, args)
        # Captions are optional: only fetch them when the
        # --download_captions option is truthy.
        if args.download_captions:
            download_captions(video_id, args)