-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathcanvas-scraper.py
133 lines (115 loc) · 4.87 KB
/
canvas-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!bin/python3
import argparse
import os
import re
from pathvalidate import sanitize_filename
from canvasapi import Canvas
from canvasapi.course import Course
from canvasapi.exceptions import Unauthorized, ResourceDoesNotExist, Forbidden
from canvasapi.file import File
from canvasapi.module import Module, ModuleItem
def extract_files(text):
    """Return the set of Canvas file IDs referenced in *text*.

    An ID is the run of digits that directly follows ``/files/`` (matched
    case-insensitively). IDs are returned as strings, exactly as they appear
    in the text; duplicates collapse because the result is a set.
    """
    return {
        match.group(1)
        for match in re.finditer("/files/(\\d+)", text, re.IGNORECASE)
    }
def _download_linked_files(course, html, path, files_downloaded):
    """Download every file referenced from *html* into the directory *path*.

    File IDs are discovered with ``extract_files`` (which yields strings) and
    converted to ``int`` before being checked against / added to
    *files_downloaded*, so that they compare equal to the numeric IDs recorded
    elsewhere in the scrape. *files_downloaded* is mutated in place.

    Files that do not exist or that the token may not access are reported and
    skipped rather than aborting the scrape.
    """
    for raw_id in extract_files(html):
        file_id = int(raw_id)
        if file_id in files_downloaded:
            continue
        try:
            file = course.get_file(file_id)
            files_downloaded.add(file_id)
            file.download(os.path.join(path, sanitize_filename(file.filename)))
        except (ResourceDoesNotExist, Unauthorized, Forbidden) as err:
            # Best effort: a dangling or inaccessible link must not stop the run.
            print(f"Skipping linked file {file_id}: {type(err).__name__}")


def get_course_files(course):
    """Save the content of every module item of *course*, then its remaining files.

    Reads the module-level names ``output`` (destination root) and ``canvas``
    (API client), which the ``__main__`` section sets before calling this.

    For each module item a directory ``<output>/<course>/<module>/`` is
    created and, depending on ``item.type``:

    * ``File``        - the file is downloaded;
    * ``Page``        - the page body is written as ``<title>.html`` and any
                        files it links to are downloaded;
    * ``ExternalUrl`` - a ``<title>.url`` internet-shortcut file is written;
    * ``Assignment``  - the description is written as ``<title>.html`` and any
                        files it links to are downloaded.

    Other item types are only printed. Finally, every file returned by
    ``course.get_files()`` that was not already fetched above is downloaded
    into ``<output>/<course>/``.
    """
    # IDs of files already fetched for this course, always stored as int so
    # that IDs parsed out of HTML and IDs coming from API objects deduplicate.
    files_downloaded = set()

    for module in course.get_modules():
        for item in module.get_module_items():
            try:
                path = os.path.join(
                    output,
                    sanitize_filename(course.name),
                    sanitize_filename(module.name),
                )
            except Exception as e:
                # Best effort: if the names cannot be read or sanitised,
                # report the problem and move on to the next item.
                print(e)
                continue
            os.makedirs(path, exist_ok=True)

            item_type = item.type
            print(f"{course.name} - "
                  f"{module.name} - "
                  f"{item.title} ({item_type})")

            if item_type == "File":
                file = canvas.get_file(item.content_id)
                files_downloaded.add(int(item.content_id))
                file.download(os.path.join(path, sanitize_filename(file.filename)))
            elif item_type == "Page":
                page = course.get_page(item.page_url)
                body = page.body or ""
                page_path = os.path.join(path, sanitize_filename(item.title) + ".html")
                with open(page_path, "w", encoding="utf-8") as f:
                    f.write(body)
                _download_linked_files(course, body, path, files_downloaded)
            elif item_type == "ExternalUrl":
                url = item.external_url
                link_path = os.path.join(path, sanitize_filename(item.title) + ".url")
                with open(link_path, "w") as f:
                    f.write("[InternetShortcut]\n")
                    f.write("URL=" + url)
            elif item_type == "Assignment":
                assignment = course.get_assignment(item.content_id)
                description = assignment.description or ""
                assignment_path = os.path.join(path, sanitize_filename(item.title) + ".html")
                with open(assignment_path, "w", encoding="utf-8") as f:
                    f.write(description)
                _download_linked_files(course, description, path, files_downloaded)

    # Sweep up files that are not attached to (or linked from) any module item.
    try:
        for file in course.get_files():
            if int(file.id) in files_downloaded:
                continue
            print(f"{course.name} - {file.filename}")
            course_dir = os.path.join(output, sanitize_filename(course.name))
            # The course directory may not exist yet when the course has no
            # module items, so make sure it is there before downloading.
            os.makedirs(course_dir, exist_ok=True)
            file.download(os.path.join(course_dir, sanitize_filename(file.filename)))
    except (Unauthorized, Forbidden) as err:
        # The files listing can be closed off for a course; treat as "nothing more".
        print(f"{course.name} - course files not accessible: {type(err).__name__}")
if __name__ == "__main__":
    # Command-line entry point: parse arguments, connect to Canvas, pick the
    # courses to scrape and hand each one to get_course_files().
    parser = argparse.ArgumentParser(description="Download all content from Canvas")
    parser.add_argument("url", help="URL to the Canvas website, e.g. https://canvas.utwente.nl")
    parser.add_argument("token", help="Token generated in the settings page on Canvas")
    parser.add_argument("output", help="Path to the output folder, e.g. output/")
    # An omitted optional positional yields its default (None here); `const`
    # only applies to option flags, so it is not used for this argument.
    parser.add_argument("courses", help="Comma-separated course IDs or 'all'", nargs="?", default=None)
    args = parser.parse_args()

    # Handle args
    # `output` and `canvas` are module-level names read by get_course_files().
    output = args.output.rstrip("/") + "/"
    if args.courses is None:
        args.courses = "all"
        print("No courses specified. Scraping all courses.")

    canvas = Canvas(args.url, args.token)

    # Select courses to scrape, default to all
    if args.courses != "all":
        # Validate the whole list before any network request so a typo gives a
        # clean usage error instead of a traceback part-way through.
        try:
            course_ids = [int(part) for part in args.courses.split(",") if part.strip()]
        except ValueError:
            parser.error(f"courses must be 'all' or comma-separated integer IDs, got {args.courses!r}")
        if not course_ids:
            parser.error("no course IDs given; pass 'all' or a comma-separated list of IDs")
        courses = [canvas.get_course(course_id) for course_id in course_ids]
    else:
        courses = canvas.get_courses()

    # Perform scrape
    for course in courses:
        get_course_files(course)