diff --git a/src/gutenberg2zim/download.py b/src/gutenberg2zim/download.py
index 3a38c89..6f5a48e 100644
--- a/src/gutenberg2zim/download.py
+++ b/src/gutenberg2zim/download.py
@@ -1,5 +1,5 @@
 import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import zipfile
@@ -9,7 +9,6 @@
 import apsw
 import backoff
 from kiwixstorage import KiwixStorage
-from path import Path
 
 from gutenberg2zim.constants import TMP_FOLDER, logger
 from gutenberg2zim.database import Book, BookFormat
@@ -36,15 +35,15 @@
 #         return False
 
 
-def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path):
+def handle_zipped_epub(zippath, book, dst_dir: Path):
     def clfn(fn):
-        return os.path.join(*os.path.split(fn)[1:])
+        return Path(*Path(fn).parts[1:])
 
     def is_safe(fname):
         fname = ensure_unicode(clfn(fname))
-        if Path(fname).basename() == fname:
+        if Path(fname).name == fname:
             return True
-        return fname == os.path.join("images", Path(fname).splitpath()[-1])
+        return fname == str(Path("images") / Path(fname).name)
 
     zipped_files = []
     # create temp directory to extract to
@@ -53,7 +52,7 @@ def is_safe(fname):
     with zipfile.ZipFile(zippath, "r") as zf:
         # check that there is no insecure data (absolute names)
         if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
-            Path(tmpd).rmtree_p()
+            shutil.rmtree(tmpd)
             return False
         # zipped_files = [clfn(fn) for fn in zf.namelist()]
         zipped_files = zf.namelist()
@@ -73,12 +72,12 @@ def is_safe(fname):
     # move all extracted files to proper locations
     for zipped_file in zipped_files:
         # skip folders
-        if not Path(zipped_file).ext:
+        if not Path(zipped_file).suffix:
             continue
-        src = os.path.join(tmpd, zipped_file)
-        if os.path.exists(src):
-            fname = Path(zipped_file).basename()
+        src = Path(tmpd) / zipped_file
+        if src.exists():
+            fname = Path(zipped_file).name
 
             if fname.endswith(".html") or fname.endswith(".htm"):
                 if mhtml:
@@ -91,7 +90,7 @@ def is_safe(fname):
             else:
                 dst = dst_dir.joinpath(f"{book.id}_{fname}")
             try:
-                Path(src).move(str(dst))
+                shutil.move(str(src), str(dst))
             except Exception as e:
                 import traceback
@@ -102,7 +101,8 @@
     # delete temp directory and zipfile
     if Path(zippath).exists():
         os.unlink(zippath)
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
+
 
 
 def download_book(
@@ -124,7 +124,7 @@ def download_book(
     if "html" not in formats:
         formats.append("html")
 
-    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
+    book_dir = Path(download_cache).joinpath(str(book.id))
     optimized_dir = book_dir.joinpath("optimized")
     unoptimized_dir = book_dir.joinpath("unoptimized")
     unsuccessful_formats = []
@@ -372,7 +372,7 @@ def download_all_books(
     )
 
     # ensure dir exist
-    Path(download_cache).mkdir_p()
+    Path(download_cache).mkdir(parents=True, exist_ok=True)
 
     def backoff_busy_error_hdlr(details):
         logger.warning(
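A quick sanity check of the path.py to pathlib equivalences used in handle_zipped_epub() above (a minimal sketch; the sample path is made up):

    from pathlib import Path

    p = Path("archive/images/pic.png")
    p.name                # "pic.png"   (path.py: .basename())
    p.suffix              # ".png"      (path.py: .ext)
    Path(*p.parts[1:])    # drops the leading archive folder

    # A str never compares equal to a Path, so is_safe() must normalize
    # with str() before comparing:
    "images/pic.png" == Path("images") / "pic.png"        # False
    "images/pic.png" == str(Path("images") / "pic.png")   # True

Note that path.py's .move() and .rmtree_p() delegate to shutil, which is why shutil.move() and shutil.rmtree() are the faithful replacements here; Path.rename() alone would raise OSError whenever the temp dir and dst_dir sit on different filesystems.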
diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py
index eb111bd..4322c31 100755
--- a/src/gutenberg2zim/entrypoint.py
+++ b/src/gutenberg2zim/entrypoint.py
@@ -1,8 +1,7 @@
 import os
 import sys
-
+from pathlib import Path
 from docopt import docopt
-from path import Path
 
 from gutenberg2zim.checkdeps import check_dependencies
 from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger
@@ -90,7 +89,7 @@ def main():
         arguments.get("--rdf-url")
         or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
     )
-    dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache")
+    dl_cache = arguments.get("--dl-folder") or "dl-cache"
     books_csv = arguments.get("--books") or ""
     zim_title = arguments.get("--zim-title")
     zim_desc = arguments.get("--zim-desc")
@@ -209,7 +208,7 @@ def f(x):
     if do_zim:
         logger.info("BUILDING ZIM dynamically")
         build_zimfile(
-            output_folder=Path(one_lang_one_zim_folder or ".").abspath(),
+            output_folder=Path(one_lang_one_zim_folder or ".").resolve(),
             download_cache=dl_cache,
             concurrency=concurrency,
             languages=zim_lang,
diff --git a/src/gutenberg2zim/export.py b/src/gutenberg2zim/export.py
index 20b0e63..de266ae 100644
--- a/src/gutenberg2zim/export.py
+++ b/src/gutenberg2zim/export.py
@@ -1,6 +1,6 @@
 import json
 import os
-import pathlib
+from pathlib import Path
 import shutil
 import tempfile
 import traceback
@@ -11,7 +11,6 @@
 import bs4
 from bs4 import BeautifulSoup
 from jinja2 import Environment, PackageLoader
-from path import Path
 from schedule import every
 from six import text_type
 from zimscraperlib.image.transformation import resize_image
@@ -95,7 +94,7 @@ def save_bs_output(soup, fpath, encoding=UTF8):
 
 
 def tmpl_path():
-    return os.path.join(Path(gutenberg2zim.__file__).parent, "templates")
+    return Path(gutenberg2zim.__file__).parent / "templates"
 
 
 def get_list_of_all_languages():
@@ -105,8 +104,8 @@
 def export_illustration():
     logger.info("Adding illustration")
 
-    src_illus_fpath = pathlib.Path(tmpl_path(), "favicon.png")
-    tmp_illus_fpath = pathlib.Path(TMP_FOLDER_PATH, "illustration.png")
+    src_illus_fpath = Path(tmpl_path(), "favicon.png")
+    tmp_illus_fpath = Path(TMP_FOLDER_PATH, "illustration.png")
 
     shutil.copy(src_illus_fpath, tmp_illus_fpath)
 
@@ -152,10 +151,10 @@ def export_skeleton(
         "datatables",
         "fonts",
    ):
-        src = os.path.join(src_folder, fname)
+        src = Path(src_folder) / fname
 
         # recursively add our assets, at a path identical to position in repo
-        assets_root = pathlib.Path(src)
+        assets_root = src
         if assets_root.is_file():
             Global.add_item_for(path=fname, fpath=assets_root)
         else:
@@ -163,7 +162,7 @@ def export_skeleton(
                 if not fpath.is_file() or fpath.name == "l10n.js":
                     continue
                 path = str(fpath.relative_to(src))
-                Global.add_item_for(path=os.path.join(fname, path), fpath=fpath)
+                Global.add_item_for(path=f"{fname}/{path}", fpath=fpath)
 
     # export homepage
     tpl_path = "Home.html"
@@ -273,7 +272,7 @@ def nb_by_fmt(fmt):
     def dlb(b):
         export_book(
             b,
-            book_dir=pathlib.Path(download_cache).joinpath(str(b.id)),
+            book_dir=Path(download_cache).joinpath(str(b.id)),
             formats=formats,
             books=books,
             project_id=project_id,
@@ -711,23 +710,23 @@ def optimize_epub(src, dst):
 
     remove_cover = False
     for fname in zipped_files:
-        fnp = os.path.join(tmpd, fname)
-        if Path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):
+        fnp = Path(tmpd) / fname
+        if Path(fname).suffix in (".png", ".jpeg", ".jpg", ".gif"):
             # special case to remove ugly cover
             if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                 zipped_files.remove(fname)
                 remove_cover = True
             else:
-                optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)
+                optimize_image(fnp, fnp, force=True)
 
-        if Path(fname).ext in (".htm", ".html"):
+        if Path(fname).suffix in (".htm", ".html"):
             html_content, _ = read_file(fnp)
             html = update_html_for_static(
                 book=book, html_content=html_content, formats=formats, epub=True
             )
             save_bs_output(html, fnp, UTF8)
 
-        if Path(fname).ext == ".ncx":
+        if Path(fname).suffix == ".ncx":
             pattern = "*** START: FULL LICENSE ***"
             ncx, _ = read_file(fnp)
             soup = BeautifulSoup(ncx, "lxml-xml")
@@ -744,11 +743,15 @@
     # delete {id}/cover.jpg if exist and update {id}/content.opf
     if remove_cover:
         # remove cover
-        Path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()
+        file_path = Path(tmpd) / text_type(book.id) / "cover.jpg"
+        try:
+            file_path.unlink()
+        except FileNotFoundError:
+            pass
 
         soup = None
-        opff = os.path.join(tmpd, text_type(book.id), "content.opf")
-        if os.path.exists(opff):
+        opff = Path(tmpd) / text_type(book.id) / "content.opf"
+        if opff.exists():
             opff_content, _ = read_file(opff)
             soup = BeautifulSoup(opff_content, "lxml-xml")
 
@@ -761,7 +764,7 @@
     # bundle epub as zip
     zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)
-    Path(tmpd).rmtree_p()
+    shutil.rmtree(tmpd)
 
 
 def handle_companion_file(
     fname,
@@ -821,7 +824,7 @@ def handle_companion_file(
                 as_ext=".zip",
             )
         else:
-            Path(tmp_epub.name).move(str(dst))
+            shutil.move(tmp_epub.name, str(dst))
         Global.add_item_for(path=dstfname, fpath=dst)
     if s3_storage:
         upload_to_cache(
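One subtlety in the handle_companion_file() hunk: tmp_epub is, per the surrounding code, a tempfile.NamedTemporaryFile, so it has no rename() method and its .name attribute holds the file's full on-disk path. A minimal sketch of the move pattern (the destination name is a stand-in):

    import shutil
    import tempfile

    tmp = tempfile.NamedTemporaryFile(suffix=".epub", delete=False)
    tmp.close()

    # path.py did Path(tmp.name).move(str(dst)); shutil.move() is the
    # equivalent and survives cross-filesystem moves, unlike Path.rename():
    shutil.move(tmp.name, "companion.epub")  # "companion.epub" stands in for dst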
diff --git a/src/gutenberg2zim/rdf.py b/src/gutenberg2zim/rdf.py
index ef29ee8..69c338e 100644
--- a/src/gutenberg2zim/rdf.py
+++ b/src/gutenberg2zim/rdf.py
@@ -1,8 +1,7 @@
 import os
-import pathlib
 import re
 import tarfile
-
+from pathlib import Path
 import peewee
 from bs4 import BeautifulSoup
 
@@ -18,7 +17,7 @@
 
 def get_rdf_fpath():
     fname = "rdf-files.tar.bz2"
-    fpath = pathlib.Path(fname).resolve()
+    fpath = Path(fname).resolve()
     return fpath
 
 
@@ -38,7 +37,7 @@ def parse_and_fill(rdf_path, only_books):
     rdf_tarfile = tarfile.open(name=rdf_path, mode="r|bz2")
 
     for rdf_member in rdf_tarfile:
-        rdf_member_path = pathlib.Path(rdf_member.name)
+        rdf_member_path = Path(rdf_member.name)
 
         # skip books outside of requested list
         if (
@@ -297,9 +296,10 @@ def get_formatted_number(num):
     nums = [f"{i:0=5d}" for i in range(21000, 40000)]
     for num in nums:
         print(num)  # noqa: T201
-        curd = os.path.dirname(os.path.realpath(__file__))
-        rdf = os.path.join(curd, "..", "rdf-files", num, "pg" + num + ".rdf")
-        if os.path.isfile(rdf):
+        curd = Path(__file__).resolve().parent
+        rdf = curd.parent / "rdf-files" / num / f"pg{num}.rdf"
+
+        if rdf.is_file():
             data = ""
             with open(rdf) as f:
                 data = f.read()
diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py
index deada12..407a948 100644
--- a/src/gutenberg2zim/urls.py
+++ b/src/gutenberg2zim/urls.py
@@ -1,6 +1,7 @@
 import os
+import posixpath
 import urllib.parse as urlparse
 from collections import defaultdict
 
 from gutenberg2zim.constants import TMP_FOLDER_PATH, logger
 from gutenberg2zim.database import Book, BookFormat, Url
@@ -41,14 +42,12 @@ def build(self):
         """
         if self.base == self.BASE_ONE:
             if int(self.b_id) > 10:  # noqa: PLR2004
-                base_url = os.path.join(
-                    os.path.join(*list(str(self.b_id))[:-1]), str(self.b_id)
-                )
+                base_url = posixpath.join(*list(str(self.b_id))[:-1], str(self.b_id))
             else:
-                base_url = os.path.join(os.path.join("0", str(self.b_id)))
-            url = os.path.join(self.base, base_url)
+                base_url = posixpath.join("0", str(self.b_id))
+            url = posixpath.join(self.base, base_url)
         elif self.base == self.BASE_TWO:
-            url = os.path.join(self.base, str(self.b_id))
+            url = posixpath.join(self.base, str(self.b_id))
         elif self.base == self.BASE_THREE:
             url = self.base
         return url  # type: ignore
@@ -146,9 +145,9 @@ def build_epub(files):
         return []
 
     name = "".join(["pg", b_id])
-    url = os.path.join(u.build(), name + ".epub")
-    url_images = os.path.join(u.build(), name + "-images.epub")
-    url_noimages = os.path.join(u.build(), name + "-noimages.epub")
+    url = posixpath.join(u.build(), f"{name}.epub")
+    url_images = posixpath.join(u.build(), f"{name}-images.epub")
+    url_noimages = posixpath.join(u.build(), f"{name}-noimages.epub")
     urls.extend([url, url_images, url_noimages])
 
     return urls
@@ -172,13 +171,13 @@ def build_pdf(files):
 
     for i in files:
         if "images" not in i["name"]:
-            url = os.path.join(u.build(), i["name"])
+            url = posixpath.join(u.build(), i["name"])
             urls.append(url)
 
-    url_dash1 = os.path.join(u1.build(), b_id + "-" + "pdf" + ".pdf")
-    url_dash = os.path.join(u.build(), b_id + "-" + "pdf" + ".pdf")
-    url_normal = os.path.join(u.build(), b_id + ".pdf")
-    url_pg = os.path.join(u.build(), "pg" + b_id + ".pdf")
+    url_dash1 = posixpath.join(u1.build(), f"{b_id}-pdf.pdf")
+    url_dash = posixpath.join(u.build(), f"{b_id}-pdf.pdf")
+    url_normal = posixpath.join(u.build(), f"{b_id}.pdf")
+    url_pg = posixpath.join(u.build(), f"pg{b_id}.pdf")
     urls.extend([url_dash, url_normal, url_pg, url_dash1])
 
     return list(set(urls))
@@ -199,17 +198,17 @@ def build_html(files):
 
     if all(["-h.html" not in file_names, "-h.zip" in file_names]):
         for i in files:
-            url = os.path.join(u.build(), i["name"])
+            url = posixpath.join(u.build(), i["name"])
             urls.append(url)
 
-    url_zip = os.path.join(u.build(), b_id + "-h" + ".zip")
-    # url_utf8 = os.path.join(u.build(), b_id + '-8' + '.zip')
-    url_html = os.path.join(u.build(), b_id + "-h" + ".html")
-    url_htm = os.path.join(u.build(), b_id + "-h" + ".htm")
+    url_zip = posixpath.join(u.build(), f"{b_id}-h.zip")
+    # url_utf8 = posixpath.join(u.build(), f"{b_id}-8.zip")
+    url_html = posixpath.join(u.build(), f"{b_id}-h.html")
+    url_htm = posixpath.join(u.build(), f"{b_id}-h.htm")
 
     u.with_base(UrlBuilder.BASE_TWO)
     name = "".join(["pg", b_id])
-    html_utf8 = os.path.join(u.build(), name + ".html.utf8")
+    html_utf8 = posixpath.join(u.build(), f"{name}.html.utf8")
 
     u.with_base(UrlBuilder.BASE_THREE)
     file_index = index_of_substring(files, ["html", "htm"])
@@ -220,7 +219,7 @@ def build_html(files):
     etext_names = [f"{i:0=2d}" for i in etext_nums]
     etext_urls = []
     for i in etext_names:
-        etext_urls.append(os.path.join(u.build() + i, file_name))
+        etext_urls.append(posixpath.join(u.build() + i, file_name))
     urls.extend([url_zip, url_htm, url_html, html_utf8])
     urls.extend(etext_urls)
 
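urls.py is the one place where pathlib is the wrong tool: these joins build HTTP URLs, not filesystem paths, and pathlib collapses repeated slashes, which mangles the scheme separator. posixpath.join keeps exactly the semantics os.path.join had on Linux while staying forward-slash on every platform. A small demonstration (the mirror URL is illustrative, not necessarily the project's real base):

    import posixpath
    from pathlib import PurePosixPath

    base = "http://aleph.gutenberg.org/cache/epub/"

    str(PurePosixPath(base) / "12345")
    # 'http:/aleph.gutenberg.org/cache/epub/12345'   <- scheme broken

    posixpath.join(base, "12345")
    # 'http://aleph.gutenberg.org/cache/epub/12345'  <- intact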
f"{name}-noimages.epub" urls.extend([url, url_images, url_noimages]) return urls @@ -172,13 +171,13 @@ def build_pdf(files): for i in files: if "images" not in i["name"]: - url = os.path.join(u.build(), i["name"]) + url = Path(u.build()) / i["name"] urls.append(url) - url_dash1 = os.path.join(u1.build(), b_id + "-" + "pdf" + ".pdf") - url_dash = os.path.join(u.build(), b_id + "-" + "pdf" + ".pdf") - url_normal = os.path.join(u.build(), b_id + ".pdf") - url_pg = os.path.join(u.build(), "pg" + b_id + ".pdf") + url_dash1 = Path(u1.build()) / f"{b_id}-pdf.pdf" + url_dash = Path(u.build()) / f"{b_id}-pdf.pdf" + url_normal = Path(u.build()) / f"{b_id}.pdf" + url_pg = Path(u.build()) / f"pg{b_id}.pdf" urls.extend([url_dash, url_normal, url_pg, url_dash1]) return list(set(urls)) @@ -199,17 +198,17 @@ def build_html(files): if all(["-h.html" not in file_names, "-h.zip" in file_names]): for i in files: - url = os.path.join(u.build(), i["name"]) + url = Path(u.build()) / i["name"] urls.append(url) - url_zip = os.path.join(u.build(), b_id + "-h" + ".zip") - # url_utf8 = os.path.join(u.build(), b_id + '-8' + '.zip') - url_html = os.path.join(u.build(), b_id + "-h" + ".html") - url_htm = os.path.join(u.build(), b_id + "-h" + ".htm") + url_zip = Path(u.build()) / f"{b_id}-h.zip" + # url_utf8 = Path(u.build()) / f"{b_id}-8.zip" + url_html = Path(u.build()) / f"{b_id}-h.html" + url_htm = Path(u.build()) / f"{b_id}-h.htm" u.with_base(UrlBuilder.BASE_TWO) name = "".join(["pg", b_id]) - html_utf8 = os.path.join(u.build(), name + ".html.utf8") + html_utf8 = Path(u.build()) / f"{name}.html.utf8" u.with_base(UrlBuilder.BASE_THREE) file_index = index_of_substring(files, ["html", "htm"]) @@ -220,7 +219,7 @@ def build_html(files): etext_names = [f"{i:0=2d}" for i in etext_nums] etext_urls = [] for i in etext_names: - etext_urls.append(os.path.join(u.build() + i, file_name)) + etext_urls.append(Path(u.build(), i, file_name)) urls.extend([url_zip, url_htm, url_html, html_utf8]) urls.extend(etext_urls) diff --git a/src/gutenberg2zim/utils.py b/src/gutenberg2zim/utils.py index 21bbabe..f1ddad0 100644 --- a/src/gutenberg2zim/utils.py +++ b/src/gutenberg2zim/utils.py @@ -9,7 +9,7 @@ import chardet import requests import six -from path import Path +from pathlib import Path from zimscraperlib.download import save_large_file from gutenberg2zim.constants import logger @@ -167,7 +167,7 @@ def is_bad_cover(fpath): bad_sizes = [19263] bad_sums = ["a059007e7a2e86f2bf92e4070b3e5c73"] - if Path(fpath).size not in bad_sizes: + if Path(fpath).stat().st_size not in bad_sizes: return False return md5sum(fpath) in bad_sums @@ -204,7 +204,7 @@ def save_file(content, fpath, encoding=UTF8): def zip_epub(epub_fpath, root_folder, fpaths): with zipfile.ZipFile(epub_fpath, "w", zipfile.ZIP_DEFLATED) as zf: for fpath in fpaths: - zf.write(os.path.join(root_folder, fpath), fpath) + zf.write(Path(root_folder, fpath), fpath) def ensure_unicode(v): diff --git a/src/gutenberg2zim/zim.py b/src/gutenberg2zim/zim.py index 1e163e7..bda7e4f 100644 --- a/src/gutenberg2zim/zim.py +++ b/src/gutenberg2zim/zim.py @@ -1,6 +1,6 @@ import datetime -from path import Path +from pathlib import Path from peewee import fn from gutenberg2zim.constants import logger