helpers.py
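"""Helper functions for scraping article links and full articles from the
Spiegel print archive (https://www.spiegel.de/spiegel/print)."""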

import csv
import json
import time as t
import urllib.request
from dataclasses import dataclass

from bs4 import BeautifulSoup


def get_article_links(
    start_year: int, end_year: int, start_issue: int, end_issue: int, scraper_wait: int
) -> list:
    """Gets all article links from the given range of issues.

    Args:
        start_year (int): first publishing year to scrape
        end_year (int): last publishing year to scrape (inclusive)
        start_issue (int): first issue number to scrape
        end_issue (int): last issue number to scrape (inclusive)
        scraper_wait (int): seconds to wait between requests

    Returns:
        list: list of all article links as strings
    """
    article_links = []
    for year in range(start_year, end_year + 1):
        for issue in range(start_issue, end_issue + 1):
            with urllib.request.urlopen(
                f"https://www.spiegel.de/spiegel/print/index-{year}-{issue}.html"
            ) as response:
                html = response.read()
            for a in BeautifulSoup(html, "html.parser").find_all("article"):
                article_links.append(a.find("a")["href"])
            # Be polite to the server between issue pages.
            t.sleep(scraper_wait)
    return article_links


def save_article_links_to_csv(article_links: list, path="./article_links.csv") -> None:
    """Saves article links to a CSV file, one link per row.

    Args:
        article_links (list): list of article links generated by get_article_links()
        path (str, optional): path for the CSV file. Defaults to "./article_links.csv".
    """
    with open(path, "w", encoding="UTF8", newline="") as f:
        writer = csv.writer(f)
        # Wrap each link in a list so csv.writer emits one link per row
        # instead of splitting each string into single-character columns.
        writer.writerows([link] for link in article_links)


def read_article_links_from_csv(path="./article_links.csv") -> list:
    """Reads article links from a CSV file.

    Args:
        path (str, optional): path of the CSV file to read. Defaults to "./article_links.csv".

    Returns:
        list: list of article links (empty if the file could not be read)
    """
    try:
        with open(path, newline="") as f:
            # Each row holds a single link; flatten the rows back into a list of strings.
            return [row[0] for row in csv.reader(f) if row]
    except OSError as e:
        print(f"An error occurred while reading the file: {e}")
        return []


@dataclass
class Article:
    link: str
    publishing_date: str  # ISO date string taken from the page's <meta name="date"> tag
    # issue_year: int
    # issue_number: int
    keywords: str
    author: str
    # title: str
    description: str
    body: str = ""

    # Note: this hand-written __init__ replaces the one @dataclass would generate;
    # the decorator still provides __repr__ and __eq__ based on the fields above.
    def __init__(self, article_link: str) -> None:
        with urllib.request.urlopen(article_link) as response:
            html = response.read()
        article_soup = BeautifulSoup(html, "html.parser")
        # Paragraphs without a CSS class hold the article text.
        self.body = ""
        for paragraph in article_soup.find_all("p", class_=False):
            self.body += paragraph.text
        self.link = article_link
        # Assumes the article page provides these <meta> tags.
        self.publishing_date = article_soup.find("meta", {"name": "date"})["content"]
        self.keywords = article_soup.find("meta", {"name": "news_keywords"})["content"]
        self.author = article_soup.find("meta", {"name": "author"})["content"]
        # self.title = article_link.article_title
        self.description = article_soup.find("meta", {"name": "description"})["content"]

    def to_dict(self) -> dict:
        return {
            "link": self.link,
            "publishing_date": self.publishing_date,
            "keywords": self.keywords,
            "author": self.author,
            "description": self.description,
            "body": self.body,
        }
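

# A minimal usage sketch, guarded so it only runs when the module is executed
# directly. The year/issue range and the JSON Lines output path below are
# illustrative assumptions, not values from the original project.
if __name__ == "__main__":
    links = get_article_links(2020, 2020, 1, 1, scraper_wait=2)
    save_article_links_to_csv(links)
    for link in read_article_links_from_csv():
        article = Article(link)
        # Append each scraped article as one JSON object per line.
        with open("articles.jsonl", "a", encoding="UTF8") as f:
            json.dump(article.to_dict(), f, ensure_ascii=False)
            f.write("\n")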