diff --git a/tutorials/data-lake/pull-dataset/README.md b/tutorials/data-lake/pull-dataset/README.md
index 1d1b6277..357b4889 100644
--- a/tutorials/data-lake/pull-dataset/README.md
+++ b/tutorials/data-lake/pull-dataset/README.md
@@ -1 +1,16 @@
-# Pull dataset from Rucio data lake
+# Interact with Rucio dataset files
+
+The following script assumes that all the files within a DID are present in an RSE (Rucio Storage Element), and that this RSE is accessible locally.
+ - A DID is composed of a scope plus a dataset name, in the `SCOPE:DataSet` format.
+ - If the files are not present in the RSE, replicate the dataset on the desired RSE before running the script.
+
+Run the following bash script:
+
+```bash
+> ./rucio_dataset_files.sh <SCOPE:DataSet> <list_name> <symlink_dir>
+
+# Example
+> ./rucio_dataset_files.sh calorimeter:training_data_hdf5 calorimeter_files.txt calorimeter_symlink_dir
+> cat calorimeter_files.txt
+> ls -l calorimeter_symlink_dir
+```
diff --git a/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh b/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh
new file mode 100755
index 00000000..769f7027
--- /dev/null
+++ b/tutorials/data-lake/pull-dataset/rucio_dataset_files.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+#
+# G. Guerrieri & E. Garcia (CERN) - Jun 2024
+#
+# Symlink every file replica of a Rucio dataset held on the VEGA-DCACHE RSE
+# into a newly created directory, and record the absolute path of each link
+# in a text file (one path per line).
+#
+# This script runs only on VEGA
+#
+# Usage - on a terminal run
+# > ./rucio_dataset_files.sh <SCOPE:DataSet> <list_name> <symlink_dir>
+#
+#   <SCOPE:DataSet>  DID of the dataset to look up
+#   <list_name>      list file to (re)create; ".txt" is appended unless the
+#                    name already ends with it
+#   <symlink_dir>    directory to create for the links; must not exist yet
+
+set -euo pipefail
+
+if (( $# < 3 )); then
+    echo "Usage: $0 <SCOPE:DataSet> <list_name> <symlink_dir>" >&2
+    exit 2
+fi
+
+ds=$1
+name=$2
+location=$3
+
+# Accept both "files" and "files.txt" as the list name.
+list_file="${name%.txt}.txt"
+
+# Refuse to touch anything if the link directory is already there.
+if [[ -d "${location}" ]]; then
+    printf 'Directory exists. Exiting\n%s\n' "$(cd -- "${location}" && pwd -P)" >&2
+    exit 1
+fi
+
+# Query Rucio before creating anything: with pipefail a failing lookup aborts
+# here instead of leaving an empty directory and list file behind.
+# Column 12 of the table output is taken as the replica URL; its https prefix
+# is rewritten to /dcache/sling.si (presumably the local mount - confirm).
+replicas=$(rucio list-file-replicas --rse VEGA-DCACHE "${ds}" \
+    | awk '{ print $12 }' \
+    | sed 's|https://dcache.sling.si:2880|/dcache/sling.si|g')
+
+mkdir -p -- "${location}"
+# Resolve to an absolute path so the recorded links are valid from anywhere.
+location=$(cd -- "${location}" && pwd -P)
+# Start from an empty list so stale entries from a previous run never survive.
+: > "${list_file}"
+
+while IFS= read -r file; do
+    # Rows without a 12th column print empty; the header row prints a lone "|".
+    if [[ -z "${file}" || "${file}" == "|" ]]; then continue; fi
+    fileReduced=$(basename -- "${file}")
+    echo "linking ${fileReduced} ..."
+    link="${location}/${ds/:/.}.${fileReduced}"
+    ln -s -- "${file}" "${link}"
+    echo "${link}" >> "${list_file}"
+done <<< "${replicas}"
+
+# NOTE(review): mode 777 is kept from the original; tighten it if world-write
+# access to the link directory is not actually required.
+chmod -R 777 -- "${location}"