Skip to content

Commit

Permalink
Merge pull request #150 from sbesson/memoregenerator_setId_order
Browse files Browse the repository at this point in the history
Improve fileset distribution during the memo file regeneration
  • Loading branch information
chris-allan authored Jan 17, 2025
2 parents 7b085fb + 68cfcfc commit aa533d6
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 36 deletions.
42 changes: 24 additions & 18 deletions src/dist/memo_regenerator.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
-- Export one representative image per fileset as CSV for memo-file
-- regeneration.  Within each fileset, images are ranked by id and only the
-- lowest-id image (rank = 1) is emitted, so each fileset appears exactly once.
-- Column order is significant: downstream tooling consumes this CSV positionally.
COPY (
    SELECT * FROM (
        SELECT image.id AS imageId,
               pixels.id AS pixelsId,
               image.series,
               pixelstype.value AS pixelstype,
               pixels.sizeX,
               pixels.sizeY,
               pixels.sizeZ,
               pixels.sizeC,
               pixels.sizeT,
               format.value,
               rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
          FROM fileset
         INNER JOIN image ON fileset.id = image.fileset
         INNER JOIN pixels ON image.id = pixels.image
         INNER JOIN pixelstype ON pixels.pixelstype = pixelstype.id
         INNER JOIN format ON image.format = format.id
    ) AS ranked
    WHERE ranked.rank = 1
) TO STDOUT CSV;
-- Export one representative image per fileset (the lowest-id image, rank = 1)
-- as CSV for memo-file regeneration, ordered by setId descending.
-- setId = e2.time - e1.time, i.e. the image creation event time minus the
-- upload job's update event time -- presumably an approximation of how long
-- the fileset took to import, so the most expensive filesets are listed first
-- (TODO confirm against the importer's event semantics).
-- Column order is significant: downstream tooling consumes this CSV positionally.
COPY (
    SELECT * FROM (
        SELECT image.id AS imageId,
               pixels.id AS pixelsId,
               image.series,
               pixelstype.value AS pixelstype,
               pixels.sizeX,
               pixels.sizeY,
               pixels.sizeZ,
               pixels.sizeC,
               pixels.sizeT,
               format.value,
               e2.time - e1.time AS setId,
               rank() OVER (PARTITION BY fileset.id ORDER BY image.id) AS rank
          FROM fileset
         INNER JOIN image ON fileset.id = image.fileset
         INNER JOIN pixels ON image.id = pixels.image
         INNER JOIN pixelstype ON pixels.pixelstype = pixelstype.id
         INNER JOIN format ON image.format = format.id
         INNER JOIN event e2 ON image.creation_id = e2.id
         INNER JOIN filesetjoblink ON filesetjoblink.parent = fileset.id
         INNER JOIN job ON filesetjoblink.child = job.id
         INNER JOIN uploadjob ON job.id = uploadjob.job_id
         INNER JOIN event e1 ON job.update_id = e1.id
    ) AS query
    WHERE query.rank = 1
    ORDER BY query.setId DESC
) TO STDOUT CSV;
41 changes: 23 additions & 18 deletions src/dist/regen-memo-files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,42 @@
#######################################
# Print usage information for regen-memo-files.sh and exit.
# NOTE(review): the scraped diff had interleaved old/new help lines
# (duplicate --help/--db/--csv/--cache-options/--batch-size entries and two
# Example sections); this is the reconstructed post-change version with each
# option documented exactly once.
# Globals:
#   Reads $0 for the script name in examples.
# Arguments:
#   $1 - exit status to terminate with
# Outputs:
#   Writes usage text to stdout
#######################################
usage() {
    echo "Usage:"
    echo "$0 [OPTIONS]"
    echo "Regenerates Bio-Formats memo files in parallel"
    echo
    echo "This utility queries the OMERO database for a list of filesets, splits the output"
    echo "into several input files and runs the memoregenerator utility using GNU parallel."
    echo
    echo " OPTIONS:"
    echo "  --batch-size         Maximum number of entries in each input file sent to parallel (default: 500)"
    echo "  --cache-options      Memofile cache options [/path/to/dir | inplace] (required)"
    echo "  --csv                Bypass sql query and use this csv for image list"
    echo "  --db                 Database connection string"
    echo "  --force-image-regen  Force regeneration of image list even if it exists already"
    echo "  --help               Display usage and exit"
    echo "  --jobs               Maximum number of jobs to parallelize (default: number of processing units available)"
    echo "  --memoizer-home      Location of image-region micro-service (default: current directory)"
    echo "  --no-ask             Do not ask for confirmation"
    echo "  --no-wait            Do not wait to start generating -- DO IT NOW"
    echo
    echo "Examples:"
    echo " Regenerate memo files using the current cache directory and all available CPUs"
    echo "  $0 --cache-options inplace"
    echo " Regenerate memo files offline using a secondary cache directory and 4 CPUs"
    echo "  $0 --jobs 4 --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
    echo " Regenerate memo files offline using a secondary cache directory, all available CPUs and a database connection string"
    echo "  $0 --db postgresql://user:pass@host:port/db --cache-options /OMERO/BioFormatsCache.$( date "+%Y%m%d" )"
    # Quote the status so an empty/odd argument cannot be word-split.
    exit "${1:-0}"
}

run_split_parallel_os_dep() {
set -x
export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
CENTOS_VERSION=$(cat /etc/centos-release |cut -f 3 -d' '|cut -d. -f 1)
cd rslt.${DATESTR}
split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
PARALLEL_OPTS="error"
if [ "${CENTOS_VERSION}" = "6" ]; then
PARALLEL_OPTS="--halt 2 --gnu --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --result . ${DRYRUN}"
else
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
fi
# Split the CSV file into N * JOBS files of at most BATCH_SIZE entries using round-robin distribution
N=$(wc -l ${FULL_CSV} | awk '{print $1}')
NFILES=$(( (($N - 1) / ($BATCH_SIZE * $JOBS) + 1 ) * $JOBS ))
split -a 3 -n r/$NFILES ${FULL_CSV} -d input.
PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
set -x
/usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
${MEMOIZER_HOME}/bin/memoregenerator \
Expand Down

0 comments on commit aa533d6

Please sign in to comment.