Commit c3d1eda
updated all instances of os.path and path.py to use pathlib
MUCCHU committed Dec 20, 2023
1 parent 2fd98ed commit c3d1eda
Showing 7 changed files with 71 additions and 70 deletions.
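The changes follow a small set of recurring substitutions. As a rough reference sketch (not part of the diff), the path.py and os.path idioms replaced below map to pathlib like this:

    # Rough pathlib equivalents for the idioms replaced in this commit (sketch).
    import shutil
    from pathlib import Path

    p = Path("cache") / "file.txt"                # os.path.join("cache", "file.txt")
    p.parent.mkdir(parents=True, exist_ok=True)   # path.py mkdir_p()
    p.write_text("data")
    print(p.name, p.suffix)                       # path.py basename(), .ext
    print(p.resolve())                            # path.py abspath()
    p = p.rename(p.with_name("file2.txt"))        # path.py move()
    shutil.rmtree("cache")                        # path.py rmtree_p()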
30 changes: 15 additions & 15 deletions src/gutenberg2zim/download.py
@@ -1,5 +1,5 @@
-import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import zipfile
@@ -9,7 +9,6 @@
 import apsw
 import backoff
 from kiwixstorage import KiwixStorage
-from path import Path
 
 from gutenberg2zim.constants import TMP_FOLDER, logger
 from gutenberg2zim.database import Book, BookFormat
@@ -36,15 +35,15 @@
 #     return False
 
 
-def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path):
+def handle_zipped_epub(zippath, book, dst_dir: Path):
     def clfn(fn):
-        return os.path.join(*os.path.split(fn)[1:])
+        return Path(*Path(fn).parts[1:])
 
     def is_safe(fname):
         fname = ensure_unicode(clfn(fname))
-        if Path(fname).basename() == fname:
+        if Path(fname).name == fname:
             return True
-        return fname == os.path.join("images", Path(fname).splitpath()[-1])
+        return fname == Path("images") / Path(fname).name
 
     zipped_files = []
     # create temp directory to extract to
@@ -53,7 +52,7 @@ def is_safe(fname):
     with zipfile.ZipFile(zippath, "r") as zf:
         # check that there is no insecure data (absolute names)
         if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
-            Path(tmpd).rmtree_p()
+            shutil.rmtree(tmpd)
             return False
         # zipped_files = [clfn(fn) for fn in zf.namelist()]
         zipped_files = zf.namelist()
@@ -73,12 +72,12 @@ def is_safe(fname):
     # move all extracted files to proper locations
     for zipped_file in zipped_files:
         # skip folders
-        if not Path(zipped_file).ext:
+        if not Path(zipped_file).suffix:
             continue
 
-        src = os.path.join(tmpd, zipped_file)
-        if os.path.exists(src):
-            fname = Path(zipped_file).basename()
+        src = Path(tmpd) / zipped_file
+        if Path(src).exists():
+            fname = Path(zipped_file).name
 
             if fname.endswith(".html") or fname.endswith(".htm"):
                 if mhtml:
@@ -91,7 +90,7 @@ def is_safe(fname):
             else:
                 dst = dst_dir.joinpath(f"{book.id}_{fname}")
             try:
-                Path(src).move(str(dst))
+                Path(src).rename(dst)
             except Exception as e:
                 import traceback
 
@@ -102,7 +101,8 @@ def is_safe(fname):
     # delete temp directory and zipfile
     if Path(zippath).exists():
         os.unlink(zippath)
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
+
 
 
 def download_book(
@@ -124,7 +124,7 @@ def download_book(
     if "html" not in formats:
         formats.append("html")
 
-    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
+    book_dir = Path(download_cache).joinpath(str(book.id))
     optimized_dir = book_dir.joinpath("optimized")
     unoptimized_dir = book_dir.joinpath("unoptimized")
     unsuccessful_formats = []
@@ -372,7 +372,7 @@ def download_all_books(
     )
 
     # ensure dir exist
-    Path(download_cache).mkdir_p()
+    Path(download_cache).mkdir(parents=True, exist_ok=True)
 
     def backoff_busy_error_hdlr(details):
         logger.warning(
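For context on the hunks above: clfn() strips a zip member's top-level directory, and is_safe() then accepts only bare filenames or images/<name> entries. A standalone sketch of the new pathlib logic follows; ensure_unicode from the original is omitted, the member names are made up, and str() conversions are added here because a str == Path comparison is always False in Python (the hunk above compares directly):

    from pathlib import Path

    def clfn(fn):
        # drop the archive's top-level directory: "123/images/x.png" -> "images/x.png"
        return Path(*Path(fn).parts[1:])

    def is_safe(fname):
        fname = str(clfn(fname))
        if Path(fname).name == fname:  # bare filename, no directory part
            return True
        return fname == str(Path("images") / Path(fname).name)

    print(is_safe("12345/cover.jpg"))      # True  -> extracted as "cover.jpg"
    print(is_safe("12345/images/i1.png"))  # True  -> "images/i1.png"
    print(is_safe("12345/../../etc/x"))    # False -> rejected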
7 changes: 3 additions & 4 deletions src/gutenberg2zim/entrypoint.py
@@ -1,8 +1,7 @@
-import os
 import sys
 
+from pathlib import Path
 from docopt import docopt
-from path import Path
 
 from gutenberg2zim.checkdeps import check_dependencies
 from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger
@@ -90,7 +89,7 @@ def main():
         arguments.get("--rdf-url")
         or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
     )
-    dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache")
+    dl_cache = arguments.get("--dl-folder") or "dl-cache"
     books_csv = arguments.get("--books") or ""
     zim_title = arguments.get("--zim-title")
     zim_desc = arguments.get("--zim-desc")
@@ -209,7 +208,7 @@ def f(x):
     if do_zim:
         logger.info("BUILDING ZIM dynamically")
         build_zimfile(
-            output_folder=Path(one_lang_one_zim_folder or ".").abspath(),
+            output_folder=Path(one_lang_one_zim_folder or ".").resolve(),
             download_cache=dl_cache,
             concurrency=concurrency,
             languages=zim_lang,
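One nuance in the .abspath() to .resolve() swap above: both make the path absolute, but path.py's abspath() (a wrapper over os.path.abspath) normalizes purely textually, while pathlib's resolve() also follows symlinks on the filesystem. A quick comparison sketch:

    import os
    from pathlib import Path

    p = "."
    print(os.path.abspath(p))   # textual normalization only
    print(Path(p).resolve())    # same location, but with symlinks resolved

For a plain output folder the two normally agree; they differ only when the folder path passes through a symlink.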
41 changes: 22 additions & 19 deletions src/gutenberg2zim/export.py
@@ -1,6 +1,6 @@
 import json
-import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import traceback
@@ -11,7 +11,6 @@
 import bs4
 from bs4 import BeautifulSoup
 from jinja2 import Environment, PackageLoader
-from path import Path
 from schedule import every
 from six import text_type
 from zimscraperlib.image.transformation import resize_image
@@ -95,7 +94,7 @@ def save_bs_output(soup, fpath, encoding=UTF8):
 
 
 def tmpl_path():
-    return os.path.join(Path(gutenberg2zim.__file__).parent, "templates")
+    return Path(gutenberg2zim.__file__).parent / "templates"
 
 
 def get_list_of_all_languages():
@@ -105,8 +104,8 @@ def get_list_of_all_languages():
 def export_illustration():
     logger.info("Adding illustration")
 
-    src_illus_fpath = pathlib.Path(tmpl_path(), "favicon.png")
-    tmp_illus_fpath = pathlib.Path(TMP_FOLDER_PATH, "illustration.png")
+    src_illus_fpath = Path(tmpl_path(), "favicon.png")
+    tmp_illus_fpath = Path(TMP_FOLDER_PATH, "illustration.png")
 
     shutil.copy(src_illus_fpath, tmp_illus_fpath)
 
@@ -152,18 +151,18 @@ def export_skeleton(
         "datatables",
         "fonts",
     ):
-        src = os.path.join(src_folder, fname)
+        src = Path(src_folder) / fname
 
         # recursively add our assets, at a path identical to position in repo
-        assets_root = pathlib.Path(src)
+        assets_root = Path(src)
         if assets_root.is_file():
             Global.add_item_for(path=fname, fpath=assets_root)
         else:
             for fpath in assets_root.glob("**/*"):
                 if not fpath.is_file() or fpath.name == "l10n.js":
                     continue
                 path = str(fpath.relative_to(src))
-                Global.add_item_for(path=os.path.join(fname, path), fpath=fpath)
+                Global.add_item_for(path=str(Path(fname) / path), fpath=fpath)
 
     # export homepage
     tpl_path = "Home.html"
@@ -273,7 +272,7 @@ def nb_by_fmt(fmt):
     def dlb(b):
         export_book(
             b,
-            book_dir=pathlib.Path(download_cache).joinpath(str(b.id)),
+            book_dir=Path(download_cache).joinpath(str(b.id)),
             formats=formats,
             books=books,
             project_id=project_id,
@@ -711,23 +710,23 @@ def optimize_epub(src, dst):
 
     remove_cover = False
     for fname in zipped_files:
-        fnp = os.path.join(tmpd, fname)
-        if Path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):
+        fnp = Path(tmpd) / fname
+        if Path(fname).suffix in (".png", ".jpeg", ".jpg", ".gif"):
             # special case to remove ugly cover
             if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                 zipped_files.remove(fname)
                 remove_cover = True
             else:
-                optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)
+                optimize_image(Path(fnp), Path(fnp), force=True)
 
-        if Path(fname).ext in (".htm", ".html"):
+        if Path(fname).suffix in (".htm", ".html"):
             html_content, _ = read_file(fnp)
             html = update_html_for_static(
                 book=book, html_content=html_content, formats=formats, epub=True
             )
             save_bs_output(html, fnp, UTF8)
 
-        if Path(fname).ext == ".ncx":
+        if Path(fname).suffix == ".ncx":
             pattern = "*** START: FULL LICENSE ***"
             ncx, _ = read_file(fnp)
             soup = BeautifulSoup(ncx, "lxml-xml")
@@ -744,11 +743,15 @@ def optimize_epub(src, dst):
     # delete {id}/cover.jpg if exist and update {id}/content.opf
     if remove_cover:
         # remove cover
-        Path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()
+        file_path = Path(tmpd) / text_type(book.id) / "cover.jpg"
+        try:
+            file_path.unlink()
+        except FileNotFoundError:
+            pass
 
         soup = None
-        opff = os.path.join(tmpd, text_type(book.id), "content.opf")
-        if os.path.exists(opff):
+        opff = Path(tmpd) / text_type(book.id) / "content.opf"
+        if Path(opff).exists():
             opff_content, _ = read_file(opff)
             soup = BeautifulSoup(opff_content, "lxml-xml")
 
@@ -761,7 +764,7 @@ def optimize_epub(src, dst):
     # bundle epub as zip
     zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)
 
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
 
 def handle_companion_file(
     fname,
@@ -821,7 +824,7 @@ def handle_companion_file(
                 as_ext=".zip",
             )
         else:
-            Path(tmp_epub.name).move(str(dst))
+            tmp_epub.rename(Path(dst) / tmp_epub.name)
             Global.add_item_for(path=dstfname, fpath=dst)
             if s3_storage:
                 upload_to_cache(
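The unlink_p() replacement in optimize_epub above spells out path.py's delete-if-present behavior with try/except. On Python 3.8+ the same behavior fits on one line; a sketch with a hypothetical path:

    from pathlib import Path

    file_path = Path("12345") / "cover.jpg"   # hypothetical
    file_path.unlink(missing_ok=True)         # no error if the file is absent (3.8+)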
14 changes: 7 additions & 7 deletions src/gutenberg2zim/rdf.py
@@ -1,8 +1,7 @@
-import os
-import pathlib
 import re
 import tarfile
 
+from pathlib import Path
 import peewee
 from bs4 import BeautifulSoup
 
@@ -18,7 +17,7 @@
 
 def get_rdf_fpath():
     fname = "rdf-files.tar.bz2"
-    fpath = pathlib.Path(fname).resolve()
+    fpath = Path(fname).resolve()
     return fpath
 
 
@@ -38,7 +37,7 @@ def parse_and_fill(rdf_path, only_books):
     rdf_tarfile = tarfile.open(name=rdf_path, mode="r|bz2")
 
     for rdf_member in rdf_tarfile:
-        rdf_member_path = pathlib.Path(rdf_member.name)
+        rdf_member_path = Path(rdf_member.name)
 
         # skip books outside of requested list
         if (
@@ -297,9 +296,10 @@ def get_formatted_number(num):
     nums = [f"{i:0=5d}" for i in range(21000, 40000)]
     for num in nums:
         print(num)  # noqa: T201
-        curd = os.path.dirname(os.path.realpath(__file__))
-        rdf = os.path.join(curd, "..", "rdf-files", num, "pg" + num + ".rdf")
-        if os.path.isfile(rdf):
+        curd = Path(__file__).resolve().parent
+        rdf = curd.parent / "rdf-files" / num / f"pg{num}.rdf"
+
+        if rdf.is_file():
             data = ""
             with open(rdf) as f:
                 data = f.read()
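A detail behind the curd.parent swap above: pathlib keeps ".." components literally when joining (it never collapses them lexically), so the idiomatic equivalent of os.path.join(curd, "..") on an already-resolved path is .parent. A small illustration sketch:

    from pathlib import Path

    curd = Path(__file__).resolve().parent
    print(curd / "..")      # the ".." stays in the path as a literal component
    print(curd.parent)      # the actual parent directory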
41 changes: 20 additions & 21 deletions src/gutenberg2zim/urls.py
@@ -1,6 +1,7 @@
-import os
 import urllib.parse as urlparse
 from collections import defaultdict
+from pathlib import Path
 
 from gutenberg2zim.constants import TMP_FOLDER_PATH, logger
 from gutenberg2zim.database import Book, BookFormat, Url
@@ -41,14 +42,12 @@ def build(self):
         """
         if self.base == self.BASE_ONE:
             if int(self.b_id) > 10:  # noqa: PLR2004
-                base_url = os.path.join(
-                    os.path.join(*list(str(self.b_id))[:-1]), str(self.b_id)
-                )
+                base_url = Path(*list(str(self.b_id)[:-1])) / str(self.b_id)
             else:
-                base_url = os.path.join(os.path.join("0", str(self.b_id)))
-            url = os.path.join(self.base, base_url)
+                base_url = Path("0") / str(self.b_id)
+            url = Path(self.base) / base_url
         elif self.base == self.BASE_TWO:
-            url = os.path.join(self.base, str(self.b_id))
+            url = Path(self.base) / str(self.b_id)
         elif self.base == self.BASE_THREE:
             url = self.base
         return url  # type: ignore
@@ -146,9 +145,9 @@ def build_epub(files):
         return []
 
     name = "".join(["pg", b_id])
-    url = os.path.join(u.build(), name + ".epub")
-    url_images = os.path.join(u.build(), name + "-images.epub")
-    url_noimages = os.path.join(u.build(), name + "-noimages.epub")
+    url = Path(u.build()) / f"{name}.epub"
+    url_images = Path(u.build()) / f"{name}-images.epub"
+    url_noimages = Path(u.build()) / f"{name}-noimages.epub"
     urls.extend([url, url_images, url_noimages])
     return urls

@@ -172,13 +171,13 @@ def build_pdf(files):
 
     for i in files:
         if "images" not in i["name"]:
-            url = os.path.join(u.build(), i["name"])
+            url = Path(u.build()) / i["name"]
             urls.append(url)
 
-    url_dash1 = os.path.join(u1.build(), b_id + "-" + "pdf" + ".pdf")
-    url_dash = os.path.join(u.build(), b_id + "-" + "pdf" + ".pdf")
-    url_normal = os.path.join(u.build(), b_id + ".pdf")
-    url_pg = os.path.join(u.build(), "pg" + b_id + ".pdf")
+    url_dash1 = Path(u1.build()) / f"{b_id}-pdf.pdf"
+    url_dash = Path(u.build()) / f"{b_id}-pdf.pdf"
+    url_normal = Path(u.build()) / f"{b_id}.pdf"
+    url_pg = Path(u.build()) / f"pg{b_id}.pdf"
 
     urls.extend([url_dash, url_normal, url_pg, url_dash1])
     return list(set(urls))
Expand All @@ -199,17 +198,17 @@ def build_html(files):

if all(["-h.html" not in file_names, "-h.zip" in file_names]):
for i in files:
url = os.path.join(u.build(), i["name"])
url = Path(u.build()) / i["name"]
urls.append(url)

url_zip = os.path.join(u.build(), b_id + "-h" + ".zip")
# url_utf8 = os.path.join(u.build(), b_id + '-8' + '.zip')
url_html = os.path.join(u.build(), b_id + "-h" + ".html")
url_htm = os.path.join(u.build(), b_id + "-h" + ".htm")
url_zip = Path(u.build()) / f"{b_id}-h.zip"
# url_utf8 = Path(u.build()) / f"{b_id}-8.zip"
url_html = Path(u.build()) / f"{b_id}-h.html"
url_htm = Path(u.build()) / f"{b_id}-h.htm"

u.with_base(UrlBuilder.BASE_TWO)
name = "".join(["pg", b_id])
html_utf8 = os.path.join(u.build(), name + ".html.utf8")
html_utf8 = Path(u.build()) / f"{name}.html.utf8"

u.with_base(UrlBuilder.BASE_THREE)
file_index = index_of_substring(files, ["html", "htm"])
@@ -220,7 +219,7 @@ def build_html(files):
     etext_names = [f"{i:0=2d}" for i in etext_nums]
     etext_urls = []
     for i in etext_names:
-        etext_urls.append(os.path.join(u.build() + i, file_name))
+        etext_urls.append(Path(u.build(), i, file_name))
 
     urls.extend([url_zip, url_htm, url_html, html_utf8])
     urls.extend(etext_urls)
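A caveat worth noting for the URL builders above: pathlib collapses repeated slashes and, on Windows, Path joins with backslashes, so routing URL strings through Path is only safe if the bases carry no scheme prefix. A quick check with PurePosixPath (sketch, hypothetical host):

    from pathlib import PurePosixPath

    url = PurePosixPath("http://example.org") / "1" / "2"
    print(url)   # http:/example.org/1/2 -- the "//" is collapsed

If a base ever includes "http://", converting the joined result back with str() yields a mangled URL.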