
fix minor issues
ofou committed Jun 19, 2024
1 parent 2e5d420 commit 37ce446
Showing 2 changed files with 56 additions and 49 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -142,3 +142,4 @@ graham.epub
 Pipfile
 .vscode
 .DS_Store
+essays.csv
104 changes: 55 additions & 49 deletions graham.py
@@ -1,11 +1,8 @@
-from asyncio.log import logger
-import feedparser
 import urllib.request
 from urllib.parse import urljoin
 import time
 import os.path
 import html2text
-import unidecode
 import regex as re
 from htmldate import find_date
 import csv
@@ -23,16 +20,16 @@
 h.reference_links = True
 h.mark_code = True
 
-ART_NO = 1
+ART_NO = 0  # Initialize to 0 so the first entry is 001
 FILE = "./essays.csv"
 
-if ART_NO == 1:
-    if os.path.isfile(FILE):
-        os.remove(FILE)
+if os.path.isfile(FILE):
+    os.remove(FILE)
 
 
 def parse_main_page(base_url: str, articles_url: str):
-    assert base_url.endswith("/"), f"Base URL must end with a slash: {base_url}"
+    assert base_url.endswith(
+        "/"), f"Base URL must end with a slash: {base_url}"
     response = requests.get(base_url + articles_url)
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -47,15 +44,17 @@ def parse_main_page(base_url: str, articles_url: str):
             a_tag = td.find("font").find("a") if td.find("font") else None
             if a_tag:
                 chapter_links.append(
-                    {"link": urljoin(base_url, a_tag["href"]), "title": a_tag.text}
+                    {"link": urljoin(
+                        base_url, a_tag["href"]), "title": a_tag.text}
                 )
 
     return chapter_links
 
 
+toc = list(reversed(parse_main_page("https://paulgraham.com/", "articles.html")))
+
 # rss = feedparser.parse("http://www.aaronsw.com/2002/feeds/pgessays.rss")
 # toc = reversed(rss.entries)
-toc = reversed(parse_main_page("https://paulgraham.com/", "articles.html"))
 
 
 def update_links_in_md(joined):
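Wrapping the call in `list(...)` matters because `reversed()` returns a one-shot iterator, while a list can be indexed, measured, and traversed more than once. A quick illustration, using hypothetical links in place of the parsed table of contents:

    links = ["a.html", "b.html", "c.html"]  # hypothetical essay links
    it = reversed(links)
    print(list(it))  # ['c.html', 'b.html', 'a.html']
    print(list(it))  # [] -- the iterator is exhausted after one pass
    toc = list(reversed(links))
    print(len(toc), toc[0])  # 3 c.html
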
@@ -84,52 +83,59 @@ def update_links(match):
     return joined
 
 
+# Write the header to the CSV file only once
+with open(FILE, "a+", newline="\n") as f:
+    fieldnames = ["Article no.", "Title", "Date", "URL"]
+    csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
+    csvwriter.writeheader()
+
 for entry in toc:
+    ART_NO += 1
     URL = entry["link"]
     if "http://www.paulgraham.com/https://" in URL:
         URL = URL.replace("http://www.paulgraham.com/https://", "https://")
     TITLE = entry["title"]
 
     try:
-        with urllib.request.urlopen(URL) as website:
-            content = website.read().decode(r"unicode_escape", "utf-8")
-            parsed = h.handle(content)
-            title = "_".join(TITLE.split(" ")).lower()
-            title = re.sub(r"[\W\s]+", "", title)
-            with open(f"./essays/{ART_NO:03}_{title}.md", "wb+") as file:
-                file.write(f"# {ART_NO:03} {TITLE}\n\n".encode())
-                parsed = parsed.replace("[](index.html) \n \n", "")
-
-                parsed = [
-                    (
-                        p.replace("\n", " ")
-                        if re.match(r"^[\p{Z}\s]*(?:[^\p{Z}\s][\p{Z}\s]*){5,100}$", p)
-                        else "\n" + p + "\n"
-                    )
-                    for p in parsed.split("\n")
-                ]
-
-                encoded = " ".join(parsed).encode()
-                update_with_links = update_links_in_md(encoded)
-                file.write(update_with_links)
-
-            print(f"✅ {ART_NO:03} {TITLE}")
-
-        with open(FILE, "a+", newline="\n") as f:
-            csvwriter = csv.writer(
-                f, quoting=csv.QUOTE_MINIMAL, delimiter=",", quotechar='"'
-            )
-
-            if ART_NO == 1:
-                fieldnames = ["Article no.", "Title", "Date", "URL"]
-                csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
-                csvwriter.writeheader()
-
-            line = [ART_NO, TITLE, DATE, URL]
-
-            csvwriter.writerow(line)
+        try:
+            with urllib.request.urlopen(URL) as website:
+                content = website.read().decode("utf-8")
+        except UnicodeDecodeError:
+            with urllib.request.urlopen(URL) as website:
+                content = website.read().decode("latin-1")
+
+        parsed = h.handle(content)
+        title = "_".join(TITLE.split(" ")).lower()
+        title = re.sub(r"[\W\s]+", "", title)
+        DATE = find_date(URL)
+        with open(f"./essays/{str(ART_NO).zfill(3)}_{title}.md", "wb+") as file:
+            file.write(f"# {str(ART_NO).zfill(3)} {TITLE}\n\n".encode())
+            parsed = parsed.replace("[](index.html) \n \n", "")
+
+            parsed = [
+                (
+                    p.replace("\n", " ")
+                    if re.match(r"^[\p{Z}\s]*(?:[^\p{Z}\s][\p{Z}\s]*){5,100}$", p)
+                    else "\n" + p + "\n"
+                )
+                for p in parsed.split("\n")
+            ]
+
+            encoded = " ".join(parsed).encode()
+            update_with_links = update_links_in_md(encoded)
+            file.write(update_with_links)
+
+        print(f"✅ {str(ART_NO).zfill(3)} {TITLE}")
+
+        with open(FILE, "a+", newline="\n") as f:
+            csvwriter = csv.writer(
+                f, quoting=csv.QUOTE_MINIMAL, delimiter=",", quotechar='"'
+            )
+
+            line = [str(ART_NO).zfill(3), TITLE, DATE, URL]
+
+            csvwriter.writerow(line)

     except Exception as e:
-        print(f"❌ {ART_NO:03} {entry['title']}, ({e})")
-    ART_NO += 1
+        print(f"❌ {str(ART_NO).zfill(3)} {entry['title']}, ({e})")
     time.sleep(0.05) # half sec/article is ~2min, be nice with servers!
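
The new nested try first attempts a strict UTF-8 decode and only then falls back to Latin-1, which maps every byte value and therefore cannot raise. A self-contained sketch of the same fallback, assuming the fetched bytes are in one of those two encodings:

    def decode_page(raw: bytes) -> str:
        # Strict UTF-8 first; Latin-1 accepts any byte sequence, so the
        # fallback never fails (though other encodings may be mis-rendered).
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return raw.decode("latin-1")

    print(decode_page("café".encode("utf-8")))    # café
    print(decode_page("café".encode("latin-1")))  # café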

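The CSV header is likewise now written exactly once, before the loop, with `csv.DictWriter.writeheader()`, and each essay row is appended afterwards with a plain `csv.writer`; this replaces the old per-iteration `if ART_NO == 1` header check. A minimal sketch of that split, with a hypothetical output file:

    import csv

    OUT = "example.csv"  # hypothetical output file

    with open(OUT, "a+", newline="\n") as f:
        csv.DictWriter(f, fieldnames=["Article no.", "Title", "Date", "URL"]).writeheader()

    for row in [["001", "Some Essay", "2024-06-19", "https://example.com/essay.html"]]:
        with open(OUT, "a+", newline="\n") as f:
            csv.writer(f, quoting=csv.QUOTE_MINIMAL).writerow(row)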