-
Notifications
You must be signed in to change notification settings - Fork 1
/
johnsina.py
30 lines (23 loc) · 924 Bytes
/
johnsina.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from bs4 import BeautifulSoup
import urllib.request
import re
x = str(input("Enter a number, x being the value in http://blog.sina.com.cn/s/article_sort_x_10001_1.html: "))
howmanypages = int(input("How many pages? "))+1
for page in range(1, howmanypages):
url = 'http://blog.sina.com.cn/s/article_sort_' + x + '_' \
+ '10001_' + str(page) + '.html'
webpage = urllib.request.urlopen(url)
soup = BeautifulSoup(webpage, 'html.parser')
link = soup.find('div', attrs={'class': 'tag SG_txtc'})
links = soup.find_all('div', {'class': 'tag SG_txtc'})
print('Page {}: '.format(str(page)) + url + '\n')
for link in list(links):
print("Post: "+link.a['href'])
print()
myfile = open('posts.txt', 'a')
for link in links:
myfile.write("%s\n" % link.a['href'])
myfile.close()
file2 = open('pages.txt', 'a')
file2.write("%s\n" % url)
file2.close()