#!/bin/sh
# PCP QA Test No. 808
# exercise pmlogger stale pid files issues
#
# Copyright (c) 2015 Red Hat.  All Rights Reserved.
# Copyright (c) 2015 Mark Goodwin.  All Rights Reserved.
#

seq=`basename $0`
echo "QA output created by $seq"

# get standard environment, filters and checks
. ./common.product
. ./common.filter
. ./common.check

_cleanup()
{
    cd $here
    if [ -n "$pid" ]
    then
	$sudo rm -f $PCP_TMP_DIR/pmlogger/$pid $PCP_RUN_DIR/pmlogger.$pid.socket
    fi
    $sudo rm -f $PCP_RUN_DIR/pmlogger.pid
    _restore_auto_restart pmlogger
    _service pmlogger restart | _filter_pcp_start

    if $pmproxy_was_running
    then
	# pmproxy needs a chance to get stable ... otherwise we may see
	# badness in the PMNS for the pmproxy.* metrics if check's callback
	# checks the PMNS .. seen on vm11 (Debian 11.1)
	#
	echo "Restart pmproxy ..." >>$here/$seq.full
	_service pmproxy restart >>$here/$seq.full 2>&1
	_wait_for_pmproxy
    fi

    rm -rf $tmp.*
}

status=1	# failure is the default!
$sudo rm -rf $tmp.* $seq.full
trap "_cleanup; exit \$status" 0 1 2 3 15

_stop_auto_restart pmlogger

pmproxy_was_running=false
[ -f $PCP_RUN_DIR/pmproxy.pid ] && pmproxy_was_running=true
echo "pmproxy_was_running=$pmproxy_was_running" >>$here/$seq.full

# real QA test starts here

pid=`$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep 'pmlogger.*-P' | \
grep -v grep | awk '{print $2}'`

if [ ! -S "$PCP_RUN_DIR/pmlogger.$pid.socket" ]; then
    echo FAIL no primary pmlogger running? pid=$pid
    status=1
else
    echo found primary pmlogger and found control socket in PCP_RUN_DIR

    echo === killing the primary pmlogger with SIGKILL ===
    $sudo kill -9 $pid
    for i in 1 2 3 4 5
    do
	$sudo kill -0 $pid >/dev/null 2>&1 || break
	sleep 1
    done
    $sudo kill -0 $pid >/dev/null 2>&1 && echo "Arrggh, pid $pid won't die!"

    echo === restart primary logger ===
    _service pmlogger start | _filter_pcp_start
    _wait_for_pmlogger

    newpid=`$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep 'pmlogger.*-P' | \
	grep -v grep | awk '{print $2}'`

    if [ -z "$newpid" ]; then
    	echo FAIL pmlogger failed to start 
	echo pid of killed primary logger is $pid
	echo === ls -l $PCP_RUN_DIR ===
	ls -l $PCP_RUN_DIR
	status=1
	exit
    fi

    # cleanup from SIGKILL
    #
    $sudo rm -f $PCP_TMP_DIR/pmlogger/$pid $PCP_RUN_DIR/pmlogger.pid $PCP_RUN_DIR/pmlogger.$pid.socket


#
# Next test: check primary logger restarts with legacy hardlinks present
# This is a corner case, but can happen.
#
    pid=`$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep 'pmlogger.*-P' | \
    grep -v grep | awk '{print $2}'`

    echo === checking with stale legacy hard links
    echo === killing the primary pmlogger with SIGKILL ===
    $sudo kill -9 $pid
    for i in 1 2 3 4 5
    do
	$sudo kill -0 $pid >/dev/null 2>&1 || break
	sleep 1
    done
    $sudo kill -0 $pid >/dev/null 2>&1 && echo "Arrggh, pid $pid won't die!"

    # change the stale symlinks into hardlinks, to simulate an upgrade
    # from an earlier version of pcp (before we converted to symlinks)
    $sudo rm -f $PCP_TMP_DIR/pmlogger/primary
    $sudo ln $PCP_TMP_DIR/pmlogger/$pid $PCP_TMP_DIR/pmlogger/primary
    $sudo rm -f $PCP_RUN_DIR/pmlogger.primary.socket
    $sudo ln $PCP_RUN_DIR/pmlogger.$pid.socket $PCP_RUN_DIR/pmlogger.primary.socket

    echo === restart primary logger ===
    _service pmlogger start | _filter_pcp_start
    _wait_for_pmlogger

    newpid=`$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep 'pmlogger.*-P' | \
	grep -v grep | awk '{print $2}'`

    if [ -z "$newpid" ]; then
    	echo FAIL pmlogger failed to start 
	echo pid of killed primary logger is $pid
	echo === ls -il $PCP_RUN_DIR ===
	ls -il $PCP_RUN_DIR
	status=1
	exit
    fi

    # cleanup from SIGKILL
    #
    $sudo rm -f $PCP_TMP_DIR/pmlogger/$pid $PCP_RUN_DIR/pmlogger.pid $PCP_RUN_DIR/pmlogger.$pid.socket

    # success
    status=0
fi

exit
