helpers.py
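"""Helper functions for scraping article links and full articles from the
Spiegel print archive (https://www.spiegel.de/spiegel/print)."""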

import csv
import json
import time as t
import urllib.request
from dataclasses import dataclass

from bs4 import BeautifulSoup


def get_article_links(
    start_year: int, end_year: int, start_issue: int, end_issue: int, scraper_wait: int
) -> list:
    """Gets all article links from the given range of issues.

    Args:
        start_year (int): first publishing year to scrape
        end_year (int): last publishing year to scrape (inclusive)
        start_issue (int): first issue number to scrape
        end_issue (int): last issue number to scrape (inclusive)
        scraper_wait (int): seconds to wait between requests

    Returns:
        list: list of all article links as strings
    """
    article_links = []
    for year in range(start_year, end_year + 1):
        for issue in range(start_issue, end_issue + 1):
            with urllib.request.urlopen(
                f"https://www.spiegel.de/spiegel/print/index-{year}-{issue}.html"
            ) as response:
                html = response.read()
            for a in BeautifulSoup(html, "html.parser").find_all("article"):
                article_links.append(a.find("a")["href"])
            # Be polite to the server between issue pages.
            t.sleep(scraper_wait)
    return article_links


def save_article_links_to_csv(article_links: list, path="./article_links.csv") -> None:
    """Saves article links to a CSV file, one link per row.

    Args:
        article_links (list): list of article links generated by get_article_links()
        path (str, optional): path for the CSV file. Defaults to "./article_links.csv".
    """
    with open(path, "w", encoding="UTF8", newline="") as f:
        writer = csv.writer(f)
        # Wrap each link in a list so csv.writer emits one link per row
        # instead of splitting each string into single-character columns.
        writer.writerows([link] for link in article_links)


def read_article_links_from_csv(path="./article_links.csv") -> list:
    """Reads article links from a CSV file.

    Args:
        path (str, optional): path of the CSV file to read. Defaults to "./article_links.csv".

    Returns:
        list: list of article links (empty if the file could not be read)
    """
    try:
        with open(path, newline="") as f:
            # Each row holds a single link; flatten the rows back into a list of strings.
            return [row[0] for row in csv.reader(f) if row]
    except OSError as e:
        print(f"An error occurred while reading the file: {e}")
        return []


@dataclass
class Article:
    link: str
    publishing_date: str  # ISO date string taken from the page's <meta name="date"> tag
    # issue_year: int
    # issue_number: int
    keywords: str
    author: str
    # title: str
    description: str
    body: str = ""

    # Note: this hand-written __init__ replaces the one @dataclass would generate;
    # the decorator still provides __repr__ and __eq__ based on the fields above.
    def __init__(self, article_link: str) -> None:
        with urllib.request.urlopen(article_link) as response:
            html = response.read()
        article_soup = BeautifulSoup(html, "html.parser")
        # Paragraphs without a CSS class hold the article text.
        self.body = ""
        for paragraph in article_soup.find_all("p", class_=False):
            self.body += paragraph.text
        self.link = article_link
        # Assumes the article page provides these <meta> tags.
        self.publishing_date = article_soup.find("meta", {"name": "date"})["content"]
        self.keywords = article_soup.find("meta", {"name": "news_keywords"})["content"]
        self.author = article_soup.find("meta", {"name": "author"})["content"]
        # self.title = article_link.article_title
        self.description = article_soup.find("meta", {"name": "description"})["content"]

    def to_dict(self) -> dict:
        return {
            "link": self.link,
            "publishing_date": self.publishing_date,
            "keywords": self.keywords,
            "author": self.author,
            "description": self.description,
            "body": self.body,
        }
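

# A minimal usage sketch, guarded so it only runs when the module is executed
# directly. The year/issue range and the JSON Lines output path below are
# illustrative assumptions, not values from the original project.
if __name__ == "__main__":
    links = get_article_links(2020, 2020, 1, 1, scraper_wait=2)
    save_article_links_to_csv(links)
    for link in read_article_links_from_csv():
        article = Article(link)
        # Append each scraped article as one JSON object per line.
        with open("articles.jsonl", "a", encoding="UTF8") as f:
            json.dump(article.to_dict(), f, ensure_ascii=False)
            f.write("\n")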