From 5d0c140888bac6350b6b1b30e350524563efa3c7 Mon Sep 17 00:00:00 2001 From: Santiago Soler Date: Fri, 24 May 2024 12:40:53 -0700 Subject: [PATCH] Add guide on downloading files in parallel Add a new page to the user guide that provides instructions to download the same file by multiple processes that run in parallel, avoiding multiple downloads of the same file through the usage of lock files. Add `filelock` as a requirement for building the docs. --- doc/conf.py | 1 + doc/index.rst | 1 + doc/parallel-downloads.rst | 51 ++++++++++++++++++++++++++++++++++++++ env/requirements-docs.txt | 1 + environment.yml | 1 + 5 files changed, 55 insertions(+) create mode 100644 doc/parallel-downloads.rst diff --git a/doc/conf.py b/doc/conf.py index 6c7bb4de..41e1b79c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -41,6 +41,7 @@ "python": ("https://docs.python.org/3/", None), "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None), "requests": ("https://requests.readthedocs.io/en/latest/", None), + "filelock": ("https://py-filelock.readthedocs.io/en/latest/", None), } # Autosummary pages will be generated by sphinx-autogen instead of sphinx-build diff --git a/doc/index.rst b/doc/index.rst index f2306777..6b0f11b7 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -134,6 +134,7 @@ Are you a **scientist** or researcher? Pooch can help you too! progressbars.rst unpacking.rst decompressing.rst + parallel-downloads.rst .. toctree:: :caption: Reference diff --git a/doc/parallel-downloads.rst b/doc/parallel-downloads.rst new file mode 100644 index 00000000..bfa68af7 --- /dev/null +++ b/doc/parallel-downloads.rst @@ -0,0 +1,51 @@ +.. _paralleldownloads: + +Parallel downloads +================== + +When running :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch` on parallel +processes, Pooch will trigger multiple downloads of the same file(s). 
Although +there is no `race condition <https://en.wikipedia.org/wiki/Race_condition>`_ +happening in this process, downloading the same file multiple times is not +desirable: it slows down the fetching process and consumes more bandwidth than +necessary. + +A solution to this problem is to create a `lock file +<https://en.wikipedia.org/wiki/File_locking#Lock_files>`_ that will allow only +one process to download the desired file, and force all the other processes to +wait until it finishes before fetching the file directly from the cache. +Lock files can be easily created through the :mod:`filelock` package. + +For example, let's create a ``download.py`` file that defines a lock file +before calling the :func:`pooch.retrieve` function. + +.. code:: python + + # file: download.py + import pooch + import filelock + + lock = filelock.FileLock(lock_file="foo.lock") + with lock: + file_path = pooch.retrieve( + url="https://github.com/fatiando/pooch/raw/v1.0.0/data/tiny-data.txt", + known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e", + path="my_dir", + ) + + # Perform tasks with this file using different parameters passed as arguments + parameter = sys.argv[1] # get parameter from first argument + ... # perform tasks using the file and the parameter + +We can run this script in parallel using the Bash ampersand: + +.. code:: bash + + python download.py 1 & + python download.py 2 & + python download.py 3 & + +Since we are using a lock file, only one of these processes will take care of the +download. The rest will wait for it to finish, and then fetch the file from the +cache. Then all further tasks that the ``download.py`` script performs using the +different arguments will be run in parallel as usual. 
diff --git a/env/requirements-docs.txt b/env/requirements-docs.txt index adc9427a..ddb2b639 100644 --- a/env/requirements-docs.txt +++ b/env/requirements-docs.txt @@ -2,3 +2,4 @@ sphinx==7.2.* sphinx-book-theme==1.1.* sphinx-design==0.5.* +filelock diff --git a/environment.yml b/environment.yml index 8c36ebbb..c2c382a6 100644 --- a/environment.yml +++ b/environment.yml @@ -21,6 +21,7 @@ dependencies: - sphinx==7.2.* - sphinx-book-theme==1.1.* - sphinx-design==0.5.* + - filelock # Style - pathspec - black>=20.8b1