Skip to content

Commit

Permalink
Do not insert all RSYNC paths in database
Browse files (browse the repository at this point in the history)
  • Loading branch information
benoit74 committed Mar 7, 2024
1 parent 074631c commit ac101f3
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/gutenberg2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def f(x):
logger.info(f"PARSING rdf-files in {rdf_path}")
parse_and_fill(rdf_path=rdf_path, only_books=books)
logger.info("Add possible url to db")
setup_urls(force=force)
setup_urls(force=force, books=books)

Check warning on line 181 in src/gutenberg2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/entrypoint.py#L181

Added line #L181 was not covered by tests

if do_download:
logger.info("DOWNLOADING ebooks from mirror using filters")
Expand Down
26 changes: 24 additions & 2 deletions src/gutenberg2zim/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def build_html(files):
return list(set(urls))


def setup_urls(force):
def setup_urls(force, books):

Check warning on line 229 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L229

Added line #L229 was not covered by tests
file_with_url = TMP_FOLDER_PATH.joinpath(f"file_on_{UrlBuilder.SERVER_NAME}")

if file_with_url.exists() and not force:
Expand Down Expand Up @@ -260,10 +260,32 @@ def setup_urls(force):
qry.execute()

logger.info("\tAppending urls in DB from rsync result")
# strip rsync file to only contain relative path
count_dir = count_old = count_added = count_processed = 0

Check warning on line 263 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L263

Added line #L263 was not covered by tests
with open(file_with_url, errors="replace") as src:
# show progress in debug mode, we expect about 5.4M lines as of early 2024
if count_processed and count_processed % 100000 == 0:
logger.debug(f"\t{count_processed} rsync results processed")

Check warning on line 267 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L267

Added line #L267 was not covered by tests
for line in src.readlines():
count_processed += 1

Check warning on line 269 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L269

Added line #L269 was not covered by tests
# ignore all directory entries
if line.startswith("d"):
count_dir += 1
continue

Check warning on line 273 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L272-L273

Added lines #L272 - L273 were not covered by tests
# ignore all entries in an /old/ subfolder
if "/old/" in line:
count_old += 1
continue

Check warning on line 277 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L276-L277

Added lines #L276 - L277 were not covered by tests
# take into account the book selection which might have been passed
if books:
if not any(f"/{book}/" in line for book in books):
continue

Check warning on line 281 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L281

Added line #L281 was not covered by tests
# strip rsync file to only contain relative path
Url.create(url=line[start_rel_path_idx:].strip()) # type: ignore
count_added += 1
logger.info(

Check warning on line 285 in src/gutenberg2zim/urls.py

View check run for this annotation

Codecov / codecov/patch

src/gutenberg2zim/urls.py#L284-L285

Added lines #L284 - L285 were not covered by tests
f"\tDB is ready, {count_added} URLs have been added ({count_dir} dirs ignored, "

Check notice on line 286 in src/gutenberg2zim/urls.py

View check run for this annotation

codefactor.io / CodeFactor

src/gutenberg2zim/urls.py#L229-L286

Complex Method
f"{count_old} old stuff ignored, {count_processed} lines processed)"
)


if __name__ == "__main__":
Expand Down

0 comments on commit ac101f3

Please sign in to comment.