Commit c3d1eda
updated all instances of os.path and path.py to use pathlib
MUCCHU committed Dec 20, 2023
1 parent 2fd98ed commit c3d1eda
Showing 7 changed files with 71 additions and 70 deletions.
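The changes follow a small set of recurring substitutions. As a rough reference sketch (not part of the diff), the path.py and os.path idioms replaced below map to pathlib like this:

    # Rough pathlib equivalents for the idioms replaced in this commit (sketch).
    import shutil
    from pathlib import Path

    p = Path("cache") / "file.txt"                # os.path.join("cache", "file.txt")
    p.parent.mkdir(parents=True, exist_ok=True)   # path.py mkdir_p()
    p.write_text("data")
    print(p.name, p.suffix)                       # path.py basename(), .ext
    print(p.resolve())                            # path.py abspath()
    p = p.rename(p.with_name("file2.txt"))        # path.py move()
    shutil.rmtree("cache")                        # path.py rmtree_p()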
30 changes: 15 additions & 15 deletions src/gutenberg2zim/download.py
@@ -1,5 +1,5 @@
-import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import zipfile
@@ -9,7 +9,6 @@
 import apsw
 import backoff
 from kiwixstorage import KiwixStorage
-from path import Path
 
 from gutenberg2zim.constants import TMP_FOLDER, logger
 from gutenberg2zim.database import Book, BookFormat
@@ -36,15 +35,15 @@
 #     return False
 
 
-def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path):
+def handle_zipped_epub(zippath, book, dst_dir: Path):
     def clfn(fn):
-        return os.path.join(*os.path.split(fn)[1:])
+        return Path(*Path(fn).parts[1:])
 
     def is_safe(fname):
         fname = ensure_unicode(clfn(fname))
-        if Path(fname).basename() == fname:
+        if Path(fname).name == fname:
             return True
-        return fname == os.path.join("images", Path(fname).splitpath()[-1])
+        return fname == Path("images") / Path(fname).name
 
     zipped_files = []
     # create temp directory to extract to
@@ -53,7 +52,7 @@ def is_safe(fname):
     with zipfile.ZipFile(zippath, "r") as zf:
         # check that there is no insecure data (absolute names)
         if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
-            Path(tmpd).rmtree_p()
+            shutil.rmtree(tmpd)
             return False
         # zipped_files = [clfn(fn) for fn in zf.namelist()]
         zipped_files = zf.namelist()
@@ -73,12 +72,12 @@ def is_safe(fname):
     # move all extracted files to proper locations
     for zipped_file in zipped_files:
         # skip folders
-        if not Path(zipped_file).ext:
+        if not Path(zipped_file).suffix:
             continue
 
-        src = os.path.join(tmpd, zipped_file)
-        if os.path.exists(src):
-            fname = Path(zipped_file).basename()
+        src = Path(tmpd) / zipped_file
+        if Path(src).exists():
+            fname = Path(zipped_file).name
 
             if fname.endswith(".html") or fname.endswith(".htm"):
                 if mhtml:
@@ -91,7 +90,7 @@ def is_safe(fname):
             else:
                 dst = dst_dir.joinpath(f"{book.id}_{fname}")
             try:
-                Path(src).move(str(dst))
+                Path(src).rename(dst)
             except Exception as e:
                 import traceback
 
@@ -102,7 +101,8 @@ def is_safe(fname):
     # delete temp directory and zipfile
     if Path(zippath).exists():
         os.unlink(zippath)
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
+
 
 
 def download_book(
@@ -124,7 +124,7 @@ def download_book(
     if "html" not in formats:
         formats.append("html")
 
-    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
+    book_dir = Path(download_cache).joinpath(str(book.id))
     optimized_dir = book_dir.joinpath("optimized")
     unoptimized_dir = book_dir.joinpath("unoptimized")
     unsuccessful_formats = []
@@ -372,7 +372,7 @@ def download_all_books(
     )
 
     # ensure dir exist
-    Path(download_cache).mkdir_p()
+    Path(download_cache).mkdir(parents=True, exist_ok=True)
 
     def backoff_busy_error_hdlr(details):
         logger.warning(
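For context on the hunks above: clfn() strips a zip member's top-level directory, and is_safe() then accepts only bare filenames or images/<name> entries. A standalone sketch of the new pathlib logic follows; ensure_unicode from the original is omitted, the member names are made up, and str() conversions are added here because a str == Path comparison is always False in Python (the hunk above compares directly):

    from pathlib import Path

    def clfn(fn):
        # drop the archive's top-level directory: "123/images/x.png" -> "images/x.png"
        return Path(*Path(fn).parts[1:])

    def is_safe(fname):
        fname = str(clfn(fname))
        if Path(fname).name == fname:  # bare filename, no directory part
            return True
        return fname == str(Path("images") / Path(fname).name)

    print(is_safe("12345/cover.jpg"))      # True  -> extracted as "cover.jpg"
    print(is_safe("12345/images/i1.png"))  # True  -> "images/i1.png"
    print(is_safe("12345/../../etc/x"))    # False -> rejected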
7 changes: 3 additions & 4 deletions src/gutenberg2zim/entrypoint.py
@@ -1,8 +1,7 @@
-import os
 import sys
 
+from pathlib import Path
 from docopt import docopt
-from path import Path
 
 from gutenberg2zim.checkdeps import check_dependencies
 from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger
@@ -90,7 +89,7 @@ def main():
         arguments.get("--rdf-url")
         or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
     )
-    dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache")
+    dl_cache = arguments.get("--dl-folder") or "dl-cache"
     books_csv = arguments.get("--books") or ""
     zim_title = arguments.get("--zim-title")
     zim_desc = arguments.get("--zim-desc")
@@ -209,7 +208,7 @@ def f(x):
     if do_zim:
         logger.info("BUILDING ZIM dynamically")
         build_zimfile(
-            output_folder=Path(one_lang_one_zim_folder or ".").abspath(),
+            output_folder=Path(one_lang_one_zim_folder or ".").resolve(),
             download_cache=dl_cache,
             concurrency=concurrency,
             languages=zim_lang,
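One nuance in the .abspath() to .resolve() swap above: both make the path absolute, but path.py's abspath() (a wrapper over os.path.abspath) normalizes purely textually, while pathlib's resolve() also follows symlinks on the filesystem. A quick comparison sketch:

    import os
    from pathlib import Path

    p = "."
    print(os.path.abspath(p))   # textual normalization only
    print(Path(p).resolve())    # same location, but with symlinks resolved

For a plain output folder the two normally agree; they differ only when the folder path passes through a symlink.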
41 changes: 22 additions & 19 deletions src/gutenberg2zim/export.py
@@ -1,6 +1,6 @@
 import json
-import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import traceback
@@ -11,7 +11,6 @@
 import bs4
 from bs4 import BeautifulSoup
 from jinja2 import Environment, PackageLoader
-from path import Path
 from schedule import every
 from six import text_type
 from zimscraperlib.image.transformation import resize_image
@@ -95,7 +94,7 @@ def save_bs_output(soup, fpath, encoding=UTF8):
 
 
 def tmpl_path():
-    return os.path.join(Path(gutenberg2zim.__file__).parent, "templates")
+    return Path(gutenberg2zim.__file__).parent / "templates"
 
 
 def get_list_of_all_languages():
@@ -105,8 +104,8 @@ def get_list_of_all_languages():
 def export_illustration():
     logger.info("Adding illustration")
 
-    src_illus_fpath = pathlib.Path(tmpl_path(), "favicon.png")
-    tmp_illus_fpath = pathlib.Path(TMP_FOLDER_PATH, "illustration.png")
+    src_illus_fpath = Path(tmpl_path(), "favicon.png")
+    tmp_illus_fpath = Path(TMP_FOLDER_PATH, "illustration.png")
 
     shutil.copy(src_illus_fpath, tmp_illus_fpath)
 
@@ -152,18 +151,18 @@ def export_skeleton(
         "datatables",
         "fonts",
     ):
-        src = os.path.join(src_folder, fname)
+        src = Path(src_folder) / fname
 
         # recursively add our assets, at a path identical to position in repo
-        assets_root = pathlib.Path(src)
+        assets_root = Path(src)
         if assets_root.is_file():
             Global.add_item_for(path=fname, fpath=assets_root)
         else:
             for fpath in assets_root.glob("**/*"):
                 if not fpath.is_file() or fpath.name == "l10n.js":
                     continue
                 path = str(fpath.relative_to(src))
-                Global.add_item_for(path=os.path.join(fname, path), fpath=fpath)
+                Global.add_item_for(path=str(Path(fname) / path), fpath=fpath)
 
     # export homepage
     tpl_path = "Home.html"
@@ -273,7 +272,7 @@ def nb_by_fmt(fmt):
     def dlb(b):
         export_book(
             b,
-            book_dir=pathlib.Path(download_cache).joinpath(str(b.id)),
+            book_dir=Path(download_cache).joinpath(str(b.id)),
             formats=formats,
             books=books,
             project_id=project_id,
@@ -711,23 +710,23 @@ def optimize_epub(src, dst):
 
     remove_cover = False
     for fname in zipped_files:
-        fnp = os.path.join(tmpd, fname)
-        if Path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):
+        fnp = Path(tmpd) / fname
+        if Path(fname).suffix in (".png", ".jpeg", ".jpg", ".gif"):
             # special case to remove ugly cover
             if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                 zipped_files.remove(fname)
                 remove_cover = True
             else:
-                optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)
+                optimize_image(Path(fnp), Path(fnp), force=True)
 
-        if Path(fname).ext in (".htm", ".html"):
+        if Path(fname).suffix in (".htm", ".html"):
             html_content, _ = read_file(fnp)
             html = update_html_for_static(
                 book=book, html_content=html_content, formats=formats, epub=True
             )
             save_bs_output(html, fnp, UTF8)
 
-        if Path(fname).ext == ".ncx":
+        if Path(fname).suffix == ".ncx":
             pattern = "*** START: FULL LICENSE ***"
             ncx, _ = read_file(fnp)
             soup = BeautifulSoup(ncx, "lxml-xml")
@@ -744,11 +743,15 @@ def optimize_epub(src, dst):
     # delete {id}/cover.jpg if exist and update {id}/content.opf
     if remove_cover:
         # remove cover
-        Path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()
+        file_path = Path(tmpd) / text_type(book.id) / "cover.jpg"
+        try:
+            file_path.unlink()
+        except FileNotFoundError:
+            pass
 
         soup = None
-        opff = os.path.join(tmpd, text_type(book.id), "content.opf")
-        if os.path.exists(opff):
+        opff = Path(tmpd) / text_type(book.id) / "content.opf"
+        if Path(opff).exists():
             opff_content, _ = read_file(opff)
             soup = BeautifulSoup(opff_content, "lxml-xml")
 
@@ -761,7 +764,7 @@ def optimize_epub(src, dst):
     # bundle epub as zip
     zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)
 
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
 
 def handle_companion_file(
     fname,
@@ -821,7 +824,7 @@ def handle_companion_file(
                 as_ext=".zip",
             )
         else:
-            Path(tmp_epub.name).move(str(dst))
+            tmp_epub.rename(Path(dst) / tmp_epub.name)
             Global.add_item_for(path=dstfname, fpath=dst)
             if s3_storage:
                 upload_to_cache(
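The unlink_p() replacement in optimize_epub above spells out path.py's delete-if-present behavior with try/except. On Python 3.8+ the same behavior fits on one line; a sketch with a hypothetical path:

    from pathlib import Path

    file_path = Path("12345") / "cover.jpg"   # hypothetical
    file_path.unlink(missing_ok=True)         # no error if the file is absent (3.8+)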
14 changes: 7 additions & 7 deletions src/gutenberg2zim/rdf.py
@@ -1,8 +1,7 @@
-import os
-import pathlib
 import re
 import tarfile
 
+from pathlib import Path
 import peewee
 from bs4 import BeautifulSoup
 
@@ -18,7 +17,7 @@
 
 def get_rdf_fpath():
     fname = "rdf-files.tar.bz2"
-    fpath = pathlib.Path(fname).resolve()
+    fpath = Path(fname).resolve()
     return fpath
 
 
@@ -38,7 +37,7 @@ def parse_and_fill(rdf_path, only_books):
     rdf_tarfile = tarfile.open(name=rdf_path, mode="r|bz2")
 
     for rdf_member in rdf_tarfile:
-        rdf_member_path = pathlib.Path(rdf_member.name)
+        rdf_member_path = Path(rdf_member.name)
 
         # skip books outside of requested list
         if (
@@ -297,9 +296,10 @@ def get_formatted_number(num):
     nums = [f"{i:0=5d}" for i in range(21000, 40000)]
     for num in nums:
         print(num)  # noqa: T201
-        curd = os.path.dirname(os.path.realpath(__file__))
-        rdf = os.path.join(curd, "..", "rdf-files", num, "pg" + num + ".rdf")
-        if os.path.isfile(rdf):
+        curd = Path(__file__).resolve().parent
+        rdf = curd.parent / "rdf-files" / num / f"pg{num}.rdf"
+
+        if rdf.is_file():
             data = ""
             with open(rdf) as f:
                 data = f.read()
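A detail behind the curd.parent swap above: pathlib keeps ".." components literally when joining (it never collapses them lexically), so the idiomatic equivalent of os.path.join(curd, "..") on an already-resolved path is .parent. A small illustration sketch:

    from pathlib import Path

    curd = Path(__file__).resolve().parent
    print(curd / "..")      # the ".." stays in the path as a literal component
    print(curd.parent)      # the actual parent directory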
41 changes: 20 additions & 21 deletions src/gutenberg2zim/urls.py
@@ -1,6 +1,7 @@
-import os
 import urllib.parse as urlparse
 from collections import defaultdict
+from pathlib import Path
 
 from gutenberg2zim.constants import TMP_FOLDER_PATH, logger
 from gutenberg2zim.database import Book, BookFormat, Url
@@ -41,14 +42,12 @@ def build(self):
         """
         if self.base == self.BASE_ONE:
             if int(self.b_id) > 10:  # noqa: PLR2004
-                base_url = os.path.join(
-                    os.path.join(*list(str(self.b_id))[:-1]), str(self.b_id)
-                )
+                base_url = Path(*list(str(self.b_id)[:-1])) / str(self.b_id)
             else:
-                base_url = os.path.join(os.path.join("0", str(self.b_id)))
-            url = os.path.join(self.base, base_url)
+                base_url = Path("0") / str(self.b_id)
+            url = Path(self.base) / base_url
         elif self.base == self.BASE_TWO:
-            url = os.path.join(self.base, str(self.b_id))
+            url = Path(self.base) / str(self.b_id)
         elif self.base == self.BASE_THREE:
             url = self.base
         return url  # type: ignore
@@ -146,9 +145,9 @@ def build_epub(files):
         return []
 
     name = "".join(["pg", b_id])
-    url = os.path.join(u.build(), name + ".epub")
-    url_images = os.path.join(u.build(), name + "-images.epub")
-    url_noimages = os.path.join(u.build(), name + "-noimages.epub")
+    url = Path(u.build()) / f"{name}.epub"
+    url_images = Path(u.build()) / f"{name}-images.epub"
+    url_noimages = Path(u.build()) / f"{name}-noimages.epub"
     urls.extend([url, url_images, url_noimages])
     return urls

@@ -172,13 +171,13 @@ def build_pdf(files):
 
     for i in files:
         if "images" not in i["name"]:
-            url = os.path.join(u.build(), i["name"])
+            url = Path(u.build()) / i["name"]
             urls.append(url)
 
-    url_dash1 = os.path.join(u1.build(), b_id + "-" + "pdf" + ".pdf")
-    url_dash = os.path.join(u.build(), b_id + "-" + "pdf" + ".pdf")
-    url_normal = os.path.join(u.build(), b_id + ".pdf")
-    url_pg = os.path.join(u.build(), "pg" + b_id + ".pdf")
+    url_dash1 = Path(u1.build()) / f"{b_id}-pdf.pdf"
+    url_dash = Path(u.build()) / f"{b_id}-pdf.pdf"
+    url_normal = Path(u.build()) / f"{b_id}.pdf"
+    url_pg = Path(u.build()) / f"pg{b_id}.pdf"
 
     urls.extend([url_dash, url_normal, url_pg, url_dash1])
     return list(set(urls))
Expand All @@ -199,17 +198,17 @@ def build_html(files):

if all(["-h.html" not in file_names, "-h.zip" in file_names]):
for i in files:
url = os.path.join(u.build(), i["name"])
url = Path(u.build()) / i["name"]
urls.append(url)

url_zip = os.path.join(u.build(), b_id + "-h" + ".zip")
# url_utf8 = os.path.join(u.build(), b_id + '-8' + '.zip')
url_html = os.path.join(u.build(), b_id + "-h" + ".html")
url_htm = os.path.join(u.build(), b_id + "-h" + ".htm")
url_zip = Path(u.build()) / f"{b_id}-h.zip"
# url_utf8 = Path(u.build()) / f"{b_id}-8.zip"
url_html = Path(u.build()) / f"{b_id}-h.html"
url_htm = Path(u.build()) / f"{b_id}-h.htm"

u.with_base(UrlBuilder.BASE_TWO)
name = "".join(["pg", b_id])
html_utf8 = os.path.join(u.build(), name + ".html.utf8")
html_utf8 = Path(u.build()) / f"{name}.html.utf8"

u.with_base(UrlBuilder.BASE_THREE)
file_index = index_of_substring(files, ["html", "htm"])
@@ -220,7 +219,7 @@ def build_html(files):
     etext_names = [f"{i:0=2d}" for i in etext_nums]
     etext_urls = []
     for i in etext_names:
-        etext_urls.append(os.path.join(u.build() + i, file_name))
+        etext_urls.append(Path(u.build(), i, file_name))
 
     urls.extend([url_zip, url_htm, url_html, html_utf8])
     urls.extend(etext_urls)
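A caveat worth noting for the URL builders above: pathlib collapses repeated slashes and, on Windows, Path joins with backslashes, so routing URL strings through Path is only safe if the bases carry no scheme prefix. A quick check with PurePosixPath (sketch, hypothetical host):

    from pathlib import PurePosixPath

    url = PurePosixPath("http://example.org") / "1" / "2"
    print(url)   # http:/example.org/1/2 -- the "//" is collapsed

If a base ever includes "http://", converting the joined result back with str() yields a mangled URL.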