-
Notifications
You must be signed in to change notification settings - Fork 10
/
fetchdists.py
129 lines (109 loc) · 4.22 KB
/
fetchdists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# This program tries to parse distrowatch and create a svg graph similar to: <https://en.wikipedia.org/wiki/Linux_distribution#/media/File:Linux_Distribution_Timeline_with_Android.svg>
# Copyright (C) 2016 Jappe Klooster
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
This file does the data collection from distrowatch (or any other site with
a similar html structure)
"""
import strings
from requests import Session
from bs4 import BeautifulSoup
from re import match
from shutil import copyfileobj
def jsondumps(item):
    """Serialize *item* to a pretty-printed JSON string (4-space indent)."""
    from json import dumps
    return dumps(item, indent=4)
def fetch_details(arguments):
    """
    Fetch the details of a distribution; can be executed as a separate
    process, IO bound and blocking.

    arguments is a (baseurl, distribution_html) tuple of plain primitives
    so it can cross a multiprocessing boundary. Returns a JSON string of
    the collected attributes (name, link, metadata, release dates, logo
    file name).
    """
    (baseurl, distribution) = arguments
    # since this is subprocess space we want to reconstruct these based on
    # the passed primitives; name the parser explicitly so the result does
    # not depend on which optional parsers (lxml, html5lib) are installed
    distribution = BeautifulSoup(distribution, "html.parser")
    session = Session()
    print("downloading and parsing %s" % distribution.a.text)
    aname = distribution.a.get("href")
    hname = distribution.a.text
    # some entries carry an empty href; fall back to the first word of the
    # human-readable name, lower-cased
    aname = hname.split(' ')[0].lower() if aname == '' else aname
    link = "%s/%s" % (baseurl, aname)
    distrosoup = BeautifulSoup(session.get(link).text, "html.parser")
    structure = {
        strings.name: aname,
        "Human Name": hname,
        "Link": link
    }
    # the first <ul> on a distro page holds the key/value attribute list
    anchor = distrosoup.find('ul')
    for attribute in anchor.find_all('li'):
        if attribute.b is None:
            # no name, probably not a distro
            continue
        # extract() removes the <b> label from the item; [:-1] drops the
        # trailing colon from the label text
        name = attribute.b.extract().text[:-1]
        # NOTE(review): "\\n" targets a literal backslash-n character pair,
        # not a newline -- presumably matching distrowatch markup; verify
        structure[name] = attribute.text[1:].replace("\\n", "")

    def sanatize_date(element):
        """Find all dates and do some data sanitation if necessary."""
        date = element.text
        if "-" not in date:
            date += "-XX-XX"  # note this already exists in distrowatch input
        # placeholder XX fields become 01 so the date parses downstream
        return date.replace("XX", "01")
    structure[strings.dates] = list(map(
        sanatize_date,
        distrosoup.find_all("td", class_="Date")
    ))
    # the last image under the attribute list's parent is the distro logo
    url = "%s/%s" % (baseurl, anchor.parent.find_all('img')[-1]['src'])
    print("using image: %s" % url)
    image = session.get(url, stream=True)
    image_name = "%s.png" % aname
    with open(image_name, 'wb') as ifile:
        # let requests decompress the raw stream before copying to disk
        image.raw.decode_content = True
        copyfileobj(image.raw, ifile)
    structure[strings.image] = image_name
    return jsondumps(structure)
def fetch_dist_list_from(baseurl, search_options):
    """
    Query the search page of *baseurl* with *search_options* and return a
    JSON array (as a string) describing every distribution found, with the
    per-distribution detail pages fetched in parallel worker processes.
    """
    # for debugging...
    def tohtml(lines, outFile="output.html"):
        with open("out/%s" % outFile, "w", encoding='utf8') as f:
            f.writelines(lines)
    session = Session()
    website = session.get('%s/search.php?%s' % (baseurl, search_options)).text
    # explicit parser keeps results independent of installed parser libs
    searchSoup = BeautifulSoup(website, "html.parser")

    def tagfilter(tag):
        # distribution entries are numbered bold tags, e.g. "1. Mint";
        # raw string avoids the invalid "\." escape warning
        return tag.name == "b" and match(r"[0-9]+\.", tag.text)
    # TODO Why are we creating THIS as a json string here.
    # Why not jsut return a python array
    result = "["
    # some missing root elements
    godfathers = [
        ["android", "2008-10-23"]
    ]
    for godfather in godfathers:
        result += jsondumps({
            strings.name: godfather[0],
            strings.based: strings.independend,
            strings.dates: [godfather[1]],
            strings.status: strings.active,
            strings.image: ""
        }) + ","
    from multiprocessing import Pool
    foundDistributions = searchSoup.find_all(tagfilter)
    # context manager terminates and reaps the worker processes; the bare
    # Pool(8) previously leaked them
    with Pool(8) as pool:  # sub interpreters to use
        result += ",".join(pool.map(
            fetch_details,
            zip([baseurl] * len(foundDistributions),
                [str(x) for x in foundDistributions])
        ))
    return result + "]"