
fix minor issues
ofou committed Jun 19, 2024
1 parent 2e5d420 commit 37ce446
Showing 2 changed files with 56 additions and 49 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -142,3 +142,4 @@ graham.epub
 Pipfile
 .vscode
 .DS_Store
+essays.csv
104 changes: 55 additions & 49 deletions graham.py
@@ -1,11 +1,8 @@
-from asyncio.log import logger
-import feedparser
 import urllib.request
 from urllib.parse import urljoin
 import time
 import os.path
 import html2text
-import unidecode
 import regex as re
 from htmldate import find_date
 import csv
@@ -23,16 +20,16 @@
 h.reference_links = True
 h.mark_code = True
 
-ART_NO = 1
+ART_NO = 0  # Initialize to 0 so the first entry is 001
 FILE = "./essays.csv"
 
-if ART_NO == 1:
-    if os.path.isfile(FILE):
-        os.remove(FILE)
+if os.path.isfile(FILE):
+    os.remove(FILE)
 
 
 def parse_main_page(base_url: str, articles_url: str):
-    assert base_url.endswith("/"), f"Base URL must end with a slash: {base_url}"
+    assert base_url.endswith(
+        "/"), f"Base URL must end with a slash: {base_url}"
     response = requests.get(base_url + articles_url)
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -47,15 +44,17 @@ def parse_main_page(base_url: str, articles_url: str):
             a_tag = td.find("font").find("a") if td.find("font") else None
             if a_tag:
                 chapter_links.append(
-                    {"link": urljoin(base_url, a_tag["href"]), "title": a_tag.text}
+                    {"link": urljoin(
+                        base_url, a_tag["href"]), "title": a_tag.text}
                 )
 
     return chapter_links
 
 
+toc = list(reversed(parse_main_page("https://paulgraham.com/", "articles.html")))
+
 # rss = feedparser.parse("http://www.aaronsw.com/2002/feeds/pgessays.rss")
 # toc = reversed(rss.entries)
-toc = reversed(parse_main_page("https://paulgraham.com/", "articles.html"))
 
 
 def update_links_in_md(joined):
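Wrapping the call in `list(...)` matters because `reversed()` returns a one-shot iterator, while a list can be indexed, measured, and traversed more than once. A quick illustration, using hypothetical links in place of the parsed table of contents:

    links = ["a.html", "b.html", "c.html"]  # hypothetical essay links
    it = reversed(links)
    print(list(it))  # ['c.html', 'b.html', 'a.html']
    print(list(it))  # [] -- the iterator is exhausted after one pass
    toc = list(reversed(links))
    print(len(toc), toc[0])  # 3 c.html
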
@@ -84,52 +83,59 @@ def update_links(match):
     return joined
 
 
+# Write the header to the CSV file only once
+with open(FILE, "a+", newline="\n") as f:
+    fieldnames = ["Article no.", "Title", "Date", "URL"]
+    csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
+    csvwriter.writeheader()
+
 for entry in toc:
+    ART_NO += 1
     URL = entry["link"]
     if "http://www.paulgraham.com/https://" in URL:
         URL = URL.replace("http://www.paulgraham.com/https://", "https://")
     TITLE = entry["title"]
 
     try:
-        with urllib.request.urlopen(URL) as website:
-            content = website.read().decode(r"unicode_escape", "utf-8")
-            parsed = h.handle(content)
-            title = "_".join(TITLE.split(" ")).lower()
-            title = re.sub(r"[\W\s]+", "", title)
-            with open(f"./essays/{ART_NO:03}_{title}.md", "wb+") as file:
-                file.write(f"# {ART_NO:03} {TITLE}\n\n".encode())
-                parsed = parsed.replace("[](index.html) \n \n", "")
-
-                parsed = [
-                    (
-                        p.replace("\n", " ")
-                        if re.match(r"^[\p{Z}\s]*(?:[^\p{Z}\s][\p{Z}\s]*){5,100}$", p)
-                        else "\n" + p + "\n"
-                    )
-                    for p in parsed.split("\n")
-                ]
-
-                encoded = " ".join(parsed).encode()
-                update_with_links = update_links_in_md(encoded)
-                file.write(update_with_links)
-
-            print(f"✅ {ART_NO:03} {TITLE}")
-
-        with open(FILE, "a+", newline="\n") as f:
-            csvwriter = csv.writer(
-                f, quoting=csv.QUOTE_MINIMAL, delimiter=",", quotechar='"'
-            )
-
-            if ART_NO == 1:
-                fieldnames = ["Article no.", "Title", "Date", "URL"]
-                csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
-                csvwriter.writeheader()
-
-            line = [ART_NO, TITLE, DATE, URL]
-
-            csvwriter.writerow(line)
+        try:
+            with urllib.request.urlopen(URL) as website:
+                content = website.read().decode("utf-8")
+        except UnicodeDecodeError:
+            with urllib.request.urlopen(URL) as website:
+                content = website.read().decode("latin-1")
+
+        parsed = h.handle(content)
+        title = "_".join(TITLE.split(" ")).lower()
+        title = re.sub(r"[\W\s]+", "", title)
+        DATE = find_date(URL)
+        with open(f"./essays/{str(ART_NO).zfill(3)}_{title}.md", "wb+") as file:
+            file.write(f"# {str(ART_NO).zfill(3)} {TITLE}\n\n".encode())
+            parsed = parsed.replace("[](index.html) \n \n", "")
+
+            parsed = [
+                (
+                    p.replace("\n", " ")
+                    if re.match(r"^[\p{Z}\s]*(?:[^\p{Z}\s][\p{Z}\s]*){5,100}$", p)
+                    else "\n" + p + "\n"
+                )
+                for p in parsed.split("\n")
+            ]
+
+            encoded = " ".join(parsed).encode()
+            update_with_links = update_links_in_md(encoded)
+            file.write(update_with_links)
+
+        print(f"✅ {str(ART_NO).zfill(3)} {TITLE}")
+
+        with open(FILE, "a+", newline="\n") as f:
+            csvwriter = csv.writer(
+                f, quoting=csv.QUOTE_MINIMAL, delimiter=",", quotechar='"'
+            )
+
+            line = [str(ART_NO).zfill(3), TITLE, DATE, URL]
+
+            csvwriter.writerow(line)

     except Exception as e:
-        print(f"❌ {ART_NO:03} {entry['title']}, ({e})")
-    ART_NO += 1
+        print(f"❌ {str(ART_NO).zfill(3)} {entry['title']}, ({e})")
     time.sleep(0.05) # half sec/article is ~2min, be nice with servers!
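
The new nested try first attempts a strict UTF-8 decode and only then falls back to Latin-1, which maps every byte value and therefore cannot raise. A self-contained sketch of the same fallback, assuming the fetched bytes are in one of those two encodings:

    def decode_page(raw: bytes) -> str:
        # Strict UTF-8 first; Latin-1 accepts any byte sequence, so the
        # fallback never fails (though other encodings may be mis-rendered).
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return raw.decode("latin-1")

    print(decode_page("café".encode("utf-8")))    # café
    print(decode_page("café".encode("latin-1")))  # café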

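The CSV header is likewise now written exactly once, before the loop, with `csv.DictWriter.writeheader()`, and each essay row is appended afterwards with a plain `csv.writer`; this replaces the old per-iteration `if ART_NO == 1` header check. A minimal sketch of that split, with a hypothetical output file:

    import csv

    OUT = "example.csv"  # hypothetical output file

    with open(OUT, "a+", newline="\n") as f:
        csv.DictWriter(f, fieldnames=["Article no.", "Title", "Date", "URL"]).writeheader()

    for row in [["001", "Some Essay", "2024-06-19", "https://example.com/essay.html"]]:
        with open(OUT, "a+", newline="\n") as f:
            csv.writer(f, quoting=csv.QUOTE_MINIMAL).writerow(row)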