-
Notifications
You must be signed in to change notification settings - Fork 1
/
issuu.py
62 lines (53 loc) · 2.18 KB
/
issuu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from bs4 import BeautifulSoup
import sys
import wget
import os
import urllib.request
import re
import json
import subprocess
# URL of the document page to scrape; read as a global by GetPages() and Dict().
x = input("Enter the url: ")
# One more than the page count entered by the user, so that it can serve
# directly as the exclusive upper bound of range(1, howmanypages).
howmanypages = 1 + int(input("How many pages? "))
def GetPages():
    """Download every page image of the document and bundle them into a PDF.

    Reads the module-level ``x`` (document URL) and ``howmanypages`` (the
    exclusive upper bound of the 1-based page range). Every image URL is
    appended to ``urls.txt``. The downloaded images are combined, in page
    order, into ``<title>.pdf`` by the external ``convert`` command and are
    deleted afterwards. Does nothing when the page range is empty.

    Raises:
        TypeError: if the page has no ``og:title`` or ``og:image`` meta tag
            (``soup.find`` returns None, which cannot be subscripted).
        subprocess.CalledProcessError: if ``convert`` exits with a non-zero
            status; the downloaded images are left in place in that case.
    """
    page_numbers = range(1, howmanypages)
    if not page_numbers:
        # Nothing to download, so there is nothing to convert either.
        return

    # The document page is the same for every image, so fetch and parse it
    # once up front instead of once per page.
    with urllib.request.urlopen(x) as webpage:
        soup = BeautifulSoup(webpage, 'html.parser')
    pagetitle = soup.find('meta', attrs={'property': 'og:title'})['content']
    first_image = soup.find('meta', attrs={'property': 'og:image'})['content']
    # Assumes the og:image URL is the first page image and ends in "1.jpg";
    # removing that leaves the prefix to which "<page>.jpg" is appended.
    url_prefix = first_image.replace('1.jpg', '')

    downloaded = []
    with open('urls.txt', 'a') as url_log:
        for page in page_numbers:
            imglink = url_prefix + str(page) + ".jpg"
            # wget.download returns the local filename it saved to; keeping
            # these in download order preserves the page order for the PDF.
            downloaded.append(wget.download(imglink))
            print('Page {}: '.format(str(page)) + imglink + '\n')
            url_log.write("%s\n" % imglink)

    # Pass the files explicitly: a "page_*" pattern is not expanded by a
    # shell here, and a glob would sort page_10 before page_2 and pick up
    # stale files from earlier runs.
    params = ['convert'] + downloaded + [pagetitle.replace("/", ".") + '.pdf']
    subprocess.check_call(params)

    # Remove only what this run downloaded (portable, no shell involved).
    for image_path in downloaded:
        os.remove(image_path)
def _meta_content(soup, prop):
    """Return the ``content`` of the ``<meta property=prop>`` tag, or None."""
    tag = soup.find('meta', attrs={'property': prop})
    return tag.get('content') if tag is not None else None


def Dict():
    """Collect the document's metadata and write it to ``<title>.info.json``.

    Reads the module-level ``x`` (document URL), scrapes title, description,
    upload date and uploader from that page, then fetches the uploader's
    profile page for its ``og:title``. The record is printed and written as
    JSON, keyed by the document title. Fields whose element is not found on
    the page are left as None instead of aborting the run.

    Raises:
        TypeError: if the page has no ``og:title`` meta tag; the title is
            required because the output filename is derived from it.
    """
    fnameappend = '.info.json'

    with urllib.request.urlopen(x) as webpage:
        soup = BeautifulSoup(webpage, 'html.parser')
    pagetitle = soup.find('meta', attrs={'property': 'og:title'})['content']

    info = {
        'origurl': x,
        'title': pagetitle,
        'description': _meta_content(soup, 'og:description'),
        'uploader': None,
        'uploaderlink': None,
        'uploaded': None,
        'uploadername': None,
    }

    # These class names carry a build-specific suffix, so the elements may
    # be absent; keep the None placeholder when they are.
    date_tag = soup.find('div', attrs={'class': 'DocumentInfo__date--2llaY'})
    if date_tag is not None:
        info['uploaded'] = date_tag.get('datetime')

    uploader_tag = soup.find('div', attrs={'class': 'PublisherInfo__name--3j27Y'})
    if uploader_tag is not None:
        info['uploader'] = uploader_tag.text.strip()
        # NOTE(review): the displayed uploader text is used verbatim as the
        # profile path - presumably it matches the account name; confirm.
        info['uploaderlink'] = "https://issuu.com/" + info['uploader']
        with urllib.request.urlopen(info['uploaderlink']) as webpage2:
            soup2 = BeautifulSoup(webpage2, 'html.parser')
        info['uploadername'] = _meta_content(soup2, 'og:title')

    # Replace characters that are not allowed in filenames on common systems.
    filename = pagetitle.translate(str.maketrans("*/\\<>:\"|", "--------")).strip() + fnameappend
    print(info)
    with open(filename, 'w') as outfile:
        outfile.write(json.dumps({info['title']: info}))
# Download the page images and assemble them into "<title>.pdf".
GetPages()
# Scrape the document's metadata and write it to "<title>.info.json".
Dict()