Split SQL into $JOBS input files using round robin distribution
Using the chunks option with round robin distribution should create as many input files as there will be jobs and ensure the projected regeneration times are as evenly distributed as possible.
sbesson committed Sep 10, 2024
1 parent 8b3c9d7 commit 442341d
Showing 1 changed file with 1 addition and 1 deletion.
src/dist/regen-memo-files.sh: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ run_split_parallel_os_dep() {
 set -x
 export JAVA_OPTS="-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=rslt.${DATESTR} -Xmx2g -Dlogback.configurationFile=${MEMOIZER_HOME}/logback-memoizer.xml -Dprocessname=memoizer"
 cd rslt.${DATESTR}
-split -a 3 -l ${BATCH_SIZE} ${FULL_CSV} -d input.
+split -a 3 -n r/$JOBS ${FULL_CSV} -d input.
 PARALLEL_OPTS="--halt now,fail=1 --eta --jobs ${JOBS} --joblog parallel-${JOBS}cpus.log --files --use-cpus-instead-of-cores --results . ${DRYRUN}"
 set -x
 /usr/bin/time -p -o timed parallel ${PARALLEL_OPTS} \
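For reference, GNU coreutils split's -n r/N mode distributes input lines round robin across N output files, so every job receives roughly the same number of entries regardless of the total line count, whereas the previous -l ${BATCH_SIZE} line-count split produced a variable number of files. A minimal sketch of the behaviour, assuming GNU split and hypothetical file names:

    # Generate a 10-line sample and split it round robin across 4 output files
    seq 10 > full.csv
    split -a 3 -n r/4 full.csv -d input.
    # input.000 holds lines 1,5,9; input.001 holds 2,6,10; input.002 holds 3,7; input.003 holds 4,8
    wc -l input.*

With exactly $JOBS input files, GNU parallel can then start one worker per file, keeping the per-job workload roughly balanced.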
