schednodes

#!/bin/bash

# Strict mode: abort on any failing command (-e), on use of an unset
#  variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory that holds the current schedule file and its rotated copies.
# SGE_ROOT must be set in the environment; nounset aborts the script otherwise.
schedule_dir="${SGE_ROOT}/default/common"

# Example schedule file line:
#   5038786:120:STARTING:1523502129:44100:H:node-u05a-014:slots:4.000000
# Fields (see `man sge_schedule`):
#   job id:task id:state:start time:duration:level:object:resource:utilization

# Prints the command-line help text to stdout.
# The program name shown is the basename of $0, however the script was invoked.
function usage() {
    echo "Usage: ${0##*/} [--all|--daysback=NUM|--schedule-file=FILE] [job id [task id]]

    Parses schedule files and prints the nodes jobs were scheduled to start on.

    By default, parses only the current schedule file: if this is rotated,
      it will only show a limited selection of entries.

    Note that the file arguments will automatically use the appropriate gzip
      or xz tool to decompress files ending with .gz or .xz.

    Options:
      -h,--help             Print this message.
      --schedule-file=FILE  Parse FILE instead of the current schedule file.
      --all                 Parse current schedule file and all files named
                              \$SGE_ROOT/default/common/schedule-????????.gz
                              or \$SGE_ROOT/default/common/schedule-????????.*.xz
      --daysback=NUM        Parse schedule files named as being from NUM+1 days
                              or more recent.
    "
}

# Emits lines from the schedule log named by the global $schedule_file
#  (the name leaks in from the caller instead of being an argument: ew, FIXME),
#  decompressing on the fly with the tool that matches the file suffix.
# Arguments:
#   $1 - optional job ID; when empty the whole file is printed unfiltered
#   $2 - optional task ID; only consulted when a job ID is given
# Outputs:
#   With no job ID: every line of the file.
#   Otherwise: the lines for that job (and task), plus every "::::::::"
#   separator line so downstream code can still see the cycle boundaries.
# Exits:
#   2 if the job ID is not numeric, 3 if the task ID is not numeric.
# Returns:
#   0 on success, including when nothing matched; the grep tool's own
#   status when it reports a real error (status greater than 1).
function get_sched_lines () {
    local job_id="${1:-}"
    local task_id="${2:-}"
    local cat_tool grep_tool

    # Select tools to transparently handle compressed files.
    # The patterns need a real ".gz"/".xz" suffix, so a file that is merely
    #  called "gz" or "xz" is read as plain text.
    case "$schedule_file" in
        *.gz)
            cat_tool="zcat"
            grep_tool="zgrep"
            ;;
        *.xz)
            cat_tool="xzcat"
            grep_tool="xzgrep"
            ;;
        *)
            cat_tool="cat"
            grep_tool="grep"
            ;;
    esac

    # If we don't have a job ID, just decompress
    if [[ -z "$job_id" ]]; then
        "$cat_tool" "$schedule_file"
        return
    fi

    if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
        echo "Error: invalid job id: must be numeric only" >&2
        exit 2
    fi
    local pattern="^${job_id}:"

    # Narrow the match to a single task when a task ID was given as well
    if [[ -n "$task_id" ]]; then
        if [[ ! "$task_id" =~ ^[0-9]+$ ]]; then
            echo "Error: invalid task id: must be numeric only" >&2
            exit 3
        fi
        pattern+="${task_id}:"
    fi

    # grep exits with 1 when no line matched. That is a normal outcome here
    #  and must not abort a run over several files under errexit/pipefail,
    #  so only statuses above 1 (real errors) are passed on.
    local status=0
    "$grep_tool" -e "$pattern" -e :::::::: "$schedule_file" || status=$?
    if (( status > 1 )); then
        return "$status"
    fi
    return 0
}

# Reads schedule log lines on stdin and writes one line per started job task,
#  listing the hosts it was scheduled on, e.g.
# [Tue Sep 24 22:37:45 BST 2024] 1730421.1: node-j00a-002.myriad.ucl.ac.uk node-j00a-003.myriad.ucl.ac.uk
# The same hostname may appear more than once for some types of job.
# Timestamps are rendered by awk strftime in the local time zone.
function concat_nodes() {
    awk -F: '
        # Prints every job gathered in the current scheduling cycle and
        #  forgets it again. The array key is "<start time> <job>.<task>".
        function flush_cycle(    label, pieces) {
            for (label in jobs) {
                split(label, pieces, " ")
                print "[" strftime("%a %b %e %H:%M:%S %Z %Y", pieces[1]) "] " pieces[2] ":" jobs[label]
                delete jobs[label]
            }
        }

        # A job starting with a host-level "slots" entry: append the host
        #  name (field 7) to the string kept for that time/job/task.
        # See: `man sge_schedule` for more info on log format
        $3 == "STARTING" && $6 == "H" && $8 == "slots" {
            cycle_key = $4 " " $1 "." $2
            jobs[cycle_key] = jobs[cycle_key] " " $7
        }

        # A separator line opens a new scheduling cycle, so report the
        #  cycle that just ended.
        $0 == "::::::::" {
            flush_cycle()
        }

        # Report whatever is left once the input runs out.
        END {
            flush_cycle()
        }
    '
}

# Entry point: works out which schedule files to read from the first
#  argument, then prints the scheduled nodes found in each of them.
# Arguments:
#   $1 - optional selector: --schedule-file=FILE, --all, --daysback=NUM,
#        -h or --help; anything else is treated as the job ID
#   then an optional job ID and an optional task ID
# Globals:
#   schedule_dir (read); schedule_file is set for get_sched_lines, which
#   reads it through dynamic scoping
# Exits:
#   4 if a selected schedule file cannot be read,
#   5 if the --daysback value is not a plain number.
function main() {
    local first_arg="${1:-}"
    local -a schedule_files=()
    local schedule_file candidate
    local num_days days day_stamp

    case "$first_arg" in
        --schedule-file=*)
            schedule_files=("${first_arg#--schedule-file=}")
            shift
            ;;
        --all)
            # Let the shell expand the patterns instead of parsing ls output,
            #  so paths containing whitespace stay in one piece. A pattern
            #  that matches nothing stays literal and is dropped by -e.
            for candidate in \
                "$schedule_dir"/schedule \
                "$schedule_dir"/schedule-????????.gz \
                "$schedule_dir"/schedule-????????.*.xz; do
                if [[ -e "$candidate" ]]; then
                    schedule_files+=("$candidate")
                fi
            done
            shift
            ;;
        --daysback=*)
            num_days="${first_arg#--daysback=}"
            # Reject anything but digits: the value is used in arithmetic,
            #  where an arbitrary string would be evaluated as an expression.
            if [[ ! "$num_days" =~ ^[0-9]+$ ]]; then
                echo "Error: invalid --daysback value: must be numeric only" >&2
                exit 5
            fi
            # Force base 10 so a value such as 08 is not read as octal
            num_days=$(( 10#$num_days ))
            for (( days = 0; days <= num_days; days++ )); do
                day_stamp="$(date --date="$days days ago" +%Y%m%d)"
                # Days without a dated file are skipped silently
                for candidate in "$schedule_dir/schedule-$day_stamp"*; do
                    if [[ -e "$candidate" ]]; then
                        schedule_files+=("$candidate")
                    fi
                done
            done
            shift
            ;;
        -h|--help)
            usage
            exit
            ;;
        *)
            schedule_files=("$schedule_dir/schedule")
            ;;
    esac

    # Checking the length first keeps an empty list from tripping nounset
    #  on older bash versions when the array is expanded below.
    if (( ${#schedule_files[@]} == 0 )); then
        echo "Warning: no schedule files found" >&2
        return 0
    fi

    for schedule_file in "${schedule_files[@]}"; do
        if [[ ! -r "$schedule_file" ]]; then
            echo "Error: could not read schedule file: $schedule_file" >&2
            exit 4
        fi

        get_sched_lines "${1:-}" "${2:-}" | concat_nodes
    done
}

# Run the script: hand every command-line argument to main unchanged.
main "$@"