From ac101f320889629d520e4fbac8b6da6ea50575d2 Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:40:13 +0000
Subject: [PATCH] Do not insert all RSYNC paths in database

---
 src/gutenberg2zim/entrypoint.py |  2 +-
 src/gutenberg2zim/urls.py       | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/gutenberg2zim/entrypoint.py b/src/gutenberg2zim/entrypoint.py
index cd13f82..4ee145f 100755
--- a/src/gutenberg2zim/entrypoint.py
+++ b/src/gutenberg2zim/entrypoint.py
@@ -178,7 +178,7 @@ def f(x):
         logger.info(f"PARSING rdf-files in {rdf_path}")
         parse_and_fill(rdf_path=rdf_path, only_books=books)
         logger.info("Add possible url to db")
-        setup_urls(force=force)
+        setup_urls(force=force, books=books)
 
     if do_download:
         logger.info("DOWNLOADING ebooks from mirror using filters")
diff --git a/src/gutenberg2zim/urls.py b/src/gutenberg2zim/urls.py
index 60fa7ab..d34c0c4 100644
--- a/src/gutenberg2zim/urls.py
+++ b/src/gutenberg2zim/urls.py
@@ -226,7 +226,7 @@ def build_html(files):
     return list(set(urls))
 
 
-def setup_urls(force):
+def setup_urls(force, books):
     file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")
 
     if file_with_url.exists() and not force:
@@ -260,10 +260,32 @@ def setup_urls(force):
     qry.execute()
 
     logger.info("\tAppending urls in DB from rsync result")
-    # strip rsync file to only contain relative path
+    count_dir = count_old = count_added = count_processed = 0
     with open(file_with_url, errors="replace") as src:
         for line in src.readlines():
+            count_processed += 1
+            # show progress in debug mode, we expect about 5.4M lines as of early 2024
+            if count_processed % 100000 == 0:
+                logger.debug(f"\t{count_processed} rsync results processed")
+            # ignore all directory entries
+            if line.startswith("d"):
+                count_dir += 1
+                continue
+            # ignore all entries in an /old/ subfolder
+            if "/old/" in line:
+                count_old += 1
+                continue
+            # take into account the book selection which might have been passed
+            if books:
+                if not any(f"/{book}/" in line for book in books):
+                    continue
+            # strip rsync file to only contain relative path
             Url.create(url=line[start_rel_path_idx:].strip())  # type: ignore
+            count_added += 1
+    logger.info(
+        f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "
+        f"{count_old} old stuff ignored, {count_processed} lines processed)"
+    )
 
 
 if __name__ == "__main__":