From ac101f320889629d520e4fbac8b6da6ea50575d2 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Tue, 5 Mar 2024 15:40:13 +0000
Subject: [PATCH] Do not insert all RSYNC paths in database

While appending URLs from the rsync result, skip directory entries, entries
located in an /old/ subfolder and, when a book selection has been passed,
entries which do not match any of the selected books. Progress is logged in
debug mode every 100000 lines and a summary is logged once the DB is ready.

---
 src/gutenberg2zim/entrypoint.py |  2 +-
 src/gutenberg2zim/urls.py       | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py
index cd13f82..4ee145f 100755
--- a/src/gutenberg2zim/entrypoint.py
+++ b/src/gutenberg2zim/entrypoint.py
@@ -178,7 +178,7 @@ def f(x):
         logger.info(f"PARSING rdf-files in {rdf_path}")
         parse_and_fill(rdf_path=rdf_path, only_books=books)
         logger.info("Add possible url to db")
-        setup_urls(force=force)
+        setup_urls(force=force, books=books)
 
     if do_download:
         logger.info("DOWNLOADING ebooks from mirror using filters")
diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py
index 60fa7ab..d34c0c4 100644
--- a/src/gutenberg2zim/urls.py
+++ b/src/gutenberg2zim/urls.py
@@ -226,7 +226,7 @@ def build_html(files):
     return list(set(urls))
 
 
-def setup_urls(force):
+def setup_urls(force, books):
     file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")
 
     if file_with_url.exists() and not force:
@@ -260,10 +260,32 @@ def setup_urls(force):
     qry.execute()
 
     logger.info("\tAppending urls in DB from rsync result")
-    # strip rsync file to only contain relative path
+    count_dir = count_old = count_added = count_processed = 0
     with open(file_with_url, errors="replace") as src:
         for line in src.readlines():
+            count_processed += 1
+            # show progress in debug mode, we expect about 5.4M lines as of early 2024
+            if count_processed % 100000 == 0:
+                logger.debug(f"\t{count_processed} rsync results processed")
+            # ignore all directory entries
+            if line.startswith("d"):
+                count_dir += 1
+                continue
+            # ignore all entries in an /old/ subfolder
+            if "/old/" in line:
+                count_old += 1
+                continue
+            # take into account the book selection which might have been passed
+            if books:
+                if not any(f"/{book}/" in line for book in books):
+                    continue
+            # strip rsync file to only contain relative path
             Url.create(url=line[start_rel_path_idx:].strip())  # type: ignore
+            count_added += 1
+    logger.info(
+        f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "
+        f"{count_old} old stuff ignored, {count_processed} lines processed)"
+    )
 
 
 if __name__ == "__main__":