#!/bin/bash

# Helper script for HA/DR hooks
# currently implemented use cases are 'checkTakeover' and 'fenceMe'

# For now we only check the maintenance mode of the multi-state resource (as described in the officially supported maintenance procedure in man page SAPHanaSR_maintenance_examples(7) - EXAMPLES: * Perform an SAP HANA take-over by using SAP tools).

# Definition of the exit codes returned to the hook scripts:
# for the current use case 'checkTakeover' they translate to
#  0 - OK
#    - takeover requested by cluster, permit takeover
#  4 - block takeover (sr_takeover attribute not found or not set to 'T' and
#      multi-state resource is NOT in maintenance)
#  5 - multi-state resource in maintenance, permit takeover
#  6 - cluster not available, permit takeover
#  7 - given SID not configured in the cluster, block takeover
# 99 - unknown cluster command error, permit takeover
#

# do not use '1' as return value, so we can distinguish between the return of
# the helper script and an error of 'sudo'
#

##### functions

# usage
# Print the command line synopsis of the helper to stdout and terminate
# the script with exit status 1.
usage() {
    # the synopsis line carries the name the script was called with
    printf '%s\n' "USAGE:    $0 --sid=<SID> [--ino=<InstanceNumber>] --case <use case>"
    # one tab-indented line per option; printf reuses the format for every argument
    printf '\t%s\n' \
        "--sid=<SID>: SID (like HA1) of the SAP system" \
        "--ino=<InstanceNumber>: instance number (like 10) of the SAP instance" \
        "--case=<use case>: the use case for the hook helper" \
        "                   at the moment only 'checkTakeover' and 'fenceMe' is supported" \
        "--version: show script version" \
        "--help:    show help"
    exit 1
}

# chk_takeover_attr
# Read the takeover attribute 'hana_<sid>_sra' with crm_attribute and
# translate the outcome into the exit codes of this helper.
# Globals: reads ASID (lowercase SID); sets sra_attr, sra_value and rc
# Returns:
#   0 - attribute found and set to 'T', permit the cluster action sr_takeover
#   4 - attribute not found, not set to 'T', or any other crm_attribute
#       problem, block takeover
#   6 - crm_attribute returned 102 (ipc connection to cluster or parts of
#       it lost), permit takeover
chk_takeover_attr() {
    sra_attr="hana_${ASID}_sra"
    sra_value=$(/usr/sbin/crm_attribute -n "$sra_attr" -G -t reboot -q 2>/dev/null)
    rc=$?
    case "$rc" in
    0)
        # sr_takeover attribute found, only the value 'T' permits the takeover
        if [ "$sra_value" == "T" ]; then
            rc=0
        else
            rc=4
        fi
        ;;
    102)
        # ipc connection to cluster (or parts) lost
        rc=6
        ;;
    *)
        # 105 (sr_takeover attribute not found) as well as every other
        # problem with the crm_attribute call blocks the takeover
        rc=4
        ;;
    esac
    return "$rc"
}

# get list of known SIDs from cluster config
#csids="$(xmllint -xpath "//primitive[@*]/instance_attributes/nvpair[@name='SID']/@value" "$cibtmp" 2>/dev/null | sed -e 's/value=//g' -e 's/"//g')"

# sid_is_in_cluster
# Check, if the given SID is configured in the cluster, i.e. if the CIB dump
# contains a primitive with an instance attribute SID=<RSID>.
# Globals: reads RSID (uppercase SID) and cibtmp (file holding the CIB
#          dump); sets csid
# Returns: 0 if the SID was found, 1 if not (also when xmllint fails or
#          prints nothing)
sid_is_in_cluster() {
    csid="$(xmllint -xpath "string(//primitive[@*]/instance_attributes/nvpair[@name='SID' and @value='$RSID']/@value)" "$cibtmp" 2>/dev/null)"
    # csid is the string printed by xmllint, not a file name, so test it
    # for emptiness with -z ('-s' would look for a non-empty file of that
    # name and so never reports a missing SID)
    if [ -z "$csid" ]; then
        return 1
    fi
    return 0
}

# chk_msres_maintenance
# Report, if the multi-state resource of the given SID is in maintenance.
# Globals: reads RSID (uppercase SID) and cibtmp (file holding the CIB
#          dump); sets rname and mmaint
# Outputs: prints the value of the resource's 'maintenance' meta attribute
#          to stdout, or "false" if the resource or the attribute could
#          not be found
chk_msres_maintenance() {
    mmaint="false"
    # id of the master resource whose ocf:suse primitive carries SID=<RSID>
    rname=$(xmllint -xpath "string(//master/primitive[@class='ocf' and @provider='suse']/instance_attributes/nvpair[@name='SID' and @value='$RSID']/../../../@id)" "$cibtmp")
    # only a known resource can be queried for its maintenance attribute
    [ -n "$rname" ] && mmaint=$(xmllint -xpath "string(//master[@id='$rname']/meta_attributes/nvpair[@name='maintenance']/@value)" "$cibtmp")
    # a missing attribute counts as "not in maintenance"
    mmaint=${mmaint:-false}
    echo "$mmaint"
}

# This function needs to run already as <sid>adm
# this code is for lab only and can be removed without any notice - do not use at customer or partner systems
#
# stopSystem <InstanceNumber>
# - sleeps for 120 seconds to simulate a slow stop
# - sends the StopSystem request via sapcontrol and returns
# - always returns 0, the result of sapcontrol is not evaluated
stopSystem() {
    local sapctl="sapcontrol"
    local inst_no="$1"
    # simulate slow stop by sleeping for a while
    # TODO: should be 2*TimeToWaitMax of stopSystemAndWait()
    sleep 120
    "$sapctl" -nr "$inst_no" -function StopSystem
    return 0
}

# This function needs to run already as <sid>adm
# this code is for lab only and can be removed without any notice - do not use at customer or partner systems
#
# stopSystemAndWait <InstanceNumber> <TimeToWaitMax>
# - starts stopSystem() in the background
# - calls sapcontrol WaitforStopped with <TimeToWaitMax> and returns its
#   exit status (0 for a stopped instance, 2 for a still running instance)
# - the exit status is additionally stored in the global variable rc
#
stopSystemAndWait() {
    local sapctl="sapcontrol"
    local inst_no="$1"
    local wait_max="$2"
    # send stopSystem to background and start waiting right away (too
    # early on purpose) to allow the timeout to happen
    stopSystem "$inst_no" &
    "$sapctl" -nr "$inst_no" -function WaitforStopped "$wait_max" 1
    rc=$?
    return "$rc"
}

# end function definition

##### main

# variables
version="202208091454"
exe="$0"
cmd=$(basename "$exe")

# parse cmdline
# Every regular call needs at least --sid and --case, so fewer than two
# arguments is a usage error. The only exception is a lone '--version',
# which has to reach the option loop to print the script version.
if [ $# -lt 2 ] && [ "$1" != "--version" ]; then
    usage
fi
# options are accepted as '--opt=value' and as '--opt value'; the second
# form consumes one additional argument
while [ $# -gt 0 ]; do
    case "$1" in
    --sid=?*)
        SID=${1#*=}
        ;;
    --sid)
        SID="$2"
        shift
        ;;
    --ino=?*)
        InstanceNumber=${1#*=}
        ;;
    --ino )
        InstanceNumber="$2"
        shift
        ;;
    --case=?*)
        USECASE=${1#*=}
        ;;
    --case)
        USECASE="$2"
        shift
        ;;
    --version)
        echo "$cmd version - $version"
        exit
        ;;
    --help|-h|-\?)
        usage
        ;;
    *)
        # unknown option or option with an empty '=' value
        usage
        ;;
    esac
    shift
done
# the SID is mandatory for every use case
if [ -z "$SID" ]; then
    echo "ERROR: missing SID"
    usage
fi

# uppercase SID used for resources
RSID=${SID^^}
# lowercase SID used for attributes
ASID=${SID,,}

# run the requested use case; every branch sets 'rc', the exit status which
# is returned to the hook at the end of the script
case "$USECASE" in
"checkTakeover")
    # query CIB and write content to a temporary file to limit the
    # cluster calls to a minimum.
    cibtmp=$(mktemp /tmp/SAPHanaSR_SRHhelper.XXXXXX)
    # remove the CIB dump when the script exits, not only when the end of
    # the script is reached
    trap 'rm -f "$cibtmp"' EXIT
    cibadmin -Q > "$cibtmp"; rc=$?
    if [ "$rc" != 0 ]; then
        case "$rc" in
        102)
            # ipc connection to cluster (or parts) lost
            # permit takeover
            rc=6
            ;;
        *)
            # unknown cluster command error, permit takeover
            rc=99
            ;;
        esac
    else
        # check, if given SID is configured in the cluster
        if sid_is_in_cluster; then
            chk_takeover_attr; rc=$?
            if [ "$rc" == 4 ]; then
                # sr_takeover attribute not found or not set to 'T'
                # check, if multi-state resource is in maintenance
                maint=$(chk_msres_maintenance)
                if [ "$maint" == "true" ]; then
                    # permit sr_takeover, because multi-state resource is
                    # in maintenance
                    rc=5
                else
                    # block takeover, because sr_takeover attribute not found or not
                    # set to 'T' and multi-state resource is NOT in maintenance
                    rc=4
                fi
            fi
        else
            # given SID not configured in the cluster, block takeover
            rc=7
        fi
    fi
    ;;
"fenceMe" )
    # ask the cluster to fence the node this script is running on
    nodeName=$(crm_node -n)
    #stonith_admin --reboot="$nodeName"
    crm --force node fence "$nodeName"
    rc=42 # if fence is working this rc should never be sent back
    ;;
"firstStopThenKill" )
    #
    # this action needs to run as <sid>adm - do not call it as root
    # this action is lab code only and can be removed without any notice
    # this code is for lab only and can be removed without any notice - do not use at customer or partner systems
    # this code does not promise that it will be part of any product later
    #
    logger -t SAPHanaSR-hookHelper "firstStopThenKill: try to stop SAP instance $InstanceNumber"
    stopSystemAndWait "$InstanceNumber" "60"; rc="$?"
    if [ "$rc" -ne 0 ]; then
        # TODO use function for logging
        logger -t SAPHanaSR-hookHelper "firstStopThenKill: SAP instance $InstanceNumber is not stopped in time - need to kill the instance"
        HDB kill-9
        # the following logger message will most likely not find the way to the system log, as the SAPHanaSR-hookHelper process will be killed also
        logger -t SAPHanaSR-hookHelper "firstStopThenKill: SAP instance $InstanceNumber killed with signal 9"
    else
        # TODO use function for logging
        logger -t SAPHanaSR-hookHelper "firstStopThenKill: SAP instance $InstanceNumber is already down - no need to kill the instance"
    fi
    rc=0
    ;;
"firstStopThenFence" )
    #
    # this action needs to run as <sid>adm - do not call it as root
    # this action is lab code only and can be removed without any notice
    # this code is for lab only and can be removed without any notice - do not use at customer or partner systems
    # this code does not promise that it will be part of any product later
    #
    stopSystemAndWait "$InstanceNumber" "60"; rc="$?"
    if [ "$rc" -ne 0 ]; then
        # TODO use function for logging
        logger -t SAPHanaSR-hookHelper "sap instance $InstanceNumber is not stopped in time - need to fence the node"
        sudo /usr/sbin/SAPHanaSR-hookHelper --sid="$SID" --case="fenceMe"
        # the following logger message will most likely not find the way to the system log, as the node will be already fenced
        # (nothing is killed with signal 9 on this path, the message reports the fence request)
        logger -t SAPHanaSR-hookHelper "sap instance $InstanceNumber is not stopped in time - fence of the node requested"
    else
        # TODO use function for logging
        logger -t SAPHanaSR-hookHelper "sap instance $InstanceNumber is already down - no need to fence the node"
    fi
    rc=0
    ;;
#"nextUsecase")
#    chk_next_case; rc=$?
#    ;;
*)
    echo "ERROR: Unsupported use case '$USECASE'. Exiting...."
    # usage terminates the script with exit status 1
    usage
    ;;
esac

# return exit status to the hook
# (the temporary CIB dump of 'checkTakeover' is removed by the EXIT trap)
exit "$rc"
