#!/bin/bash
# Copyright (C) 2017 AT&T Intellectual Property. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this code except in compliance
# with the License. You may obtain a copy of the License
# at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# NAME
#   check_cluster - check the state of the cluster
#
# USAGE
#   check_cluster [-v] [-l] [-t timeout] [-d file]
#   -l          do not check localhost first (that check restarts the service if necessary)
#   -t timeout  set how long to wait when accessing the servers
#   -v          verbose
#   -d file     duplicate the status output to the given file
#
# DESCRIPTION
#   Loop through the nodes in the cluster, using pgwget to determine how many
#   are masters, secondaries, down for maintenance, or not up.
#   Complain about certain situations.
#   If there are multiple masters, and this is not the first master in the list, then:
#     run pg_ctl_stop
#     prevent /ro from returning true
#     (NOTE(review): presumably via the force-ro-off flag file touched below -- confirm)

CDF=/opt/app/cdf

# Prefer the versioned install and config directories when they are present.
if [[ -d /opt/app/postgresql-9.5.2 ]]; then
  PGDIR=/opt/app/postgresql-9.5.2
else
  PGDIR=/usr/lib/postgresql/9.5
fi

if [[ -d /opt/app/postgresql-config-9.5.2/ ]]; then
  CFGDIR=/opt/app/postgresql-config-9.5.2/
else
  CFGDIR=/opt/app/postgresql-config/
fi

PATH=$PGDIR/bin:$CDF/bin:/opt/app/pgaas/bin:/opt/app/postgresql-prep/bin:$PATH

# Print an optional message followed by the usage summary on stderr, then exit 1.
# Arguments: $@ - optional message printed before the usage text
usage() {
  exec 1>&2
  [ $# -gt 0 ] && echo "$@"
  echo "Usage: $0 [-v] [-l] [-t timeout] [-d file]"
  echo -e " -l do not check localhost first (and restarting the service if necessary)"
  echo -e " -t timeout set how long to wait when accessing the servers"
  echo -e " -v verbose"
  echo -e " -d file duplicate the status output to the given file"
  exit 1
}

# VERBOSE and TESTLOCAL hold a command name (":" is true, "false" is false)
# so they can be executed directly as conditions.
VERBOSE=false
TIMEOUT=10
TESTLOCAL=:
DFILE=

while getopts d:lt:v c; do
  case "$c" in
    d) DFILE=$OPTARG ;;
    l) TESTLOCAL=false ;;
    t) TIMEOUT=$OPTARG ;;
    v) VERBOSE=: ;;
    \?) usage ;;
  esac
done
shift $((OPTIND - 1))

# Per-state counters and space-separated host lists filled in by the node loop.
master_count=0
secondary_count=0
total_count=0
down_count=0
maintenance_count=0
MASTERS=
SECONDARIES=
MAINTENANCES=
DOWNS=
MSEP=
SSEP=
BSEP=
DSEP=
HOSTNAME=$(hostname -f)
FOUNDPREVIOUSMASTER=
DOPGCTLSTOP=

# Optionally probe the local responder first; if it does not report one of the
# three known states, kill it and wait before polling the cluster.
if $TESTLOCAL; then
  isrw=$(pgwget --tries=1 --read-timeout="$TIMEOUT" --quiet -O/dev/stdout http://localhost:8000/isrw)
  case "$isrw" in
    Master | Secondary | Maintenance) ;;
    *)
      echo "$(date)|WARNING|RESTARTED|Local iDNS-responder.py not responding. Restarting."
      # Only a kill (default signal) is sent here; nothing in this script starts
      # the responder again. NOTE(review): presumably an external supervisor
      # respawns it during the sleep -- confirm.
      pkill -u postgres -f "python3 /opt/app/postgresql-prep/bin/iDNS-responder.py"
      sleep 10
      ;;
  esac
fi

# Ask every node listed in the "pgnodes" property (a |-separated list) for its
# state and tally the answers. Word splitting of the substitution is intended.
for i in $(getpropvalue -n pgnodes | sed 's/|/ /g'); do
  $VERBOSE && echo -n "Checking $i"
  # Honour -t here as well, so the option applies to every server probed.
  isrw=$(pgwget --tries=1 --read-timeout="$TIMEOUT" --quiet -O/dev/stdout "http://$i:8000/isrw")
  $VERBOSE && echo ": $isrw"

  # Every node is counted, whatever state it reports.
  total_count=$((total_count + 1))

  case "$isrw" in
    Master)
      master_count=$((master_count + 1))
      MASTERS="$MASTERS$MSEP$i"
      MSEP=" "
      if [[ -z "$FOUNDPREVIOUSMASTER" ]]; then
        # The first master in list order is the one allowed to stay up.
        FOUNDPREVIOUSMASTER=yes
      elif [[ "$i" == "$HOSTNAME" ]]; then
        # This host is a master, but an earlier node already claimed the role.
        DOPGCTLSTOP=yes
      fi
      ;;
    Secondary)
      secondary_count=$((secondary_count + 1))
      SECONDARIES="$SECONDARIES$SSEP$i"
      SSEP=" "
      ;;
    Maintenance)
      maintenance_count=$((maintenance_count + 1))
      MAINTENANCES="$MAINTENANCES$BSEP$i"
      BSEP=" "
      ;;
    *)
      # Any other answer, including no answer at all, counts as down.
      DOWNS="$DOWNS$DSEP$i"
      DSEP=" "
      down_count=$((down_count + 1))
      ;;
  esac
done

up_count=$((master_count + secondary_count))

# Use one timestamp for every line of this report.
date=$(date)
output="$date|INFO|masters=$master_count $MASTERS|secondaries=$secondary_count $SECONDARIES|maintenance=$maintenance_count $MAINTENANCES|down=$down_count $DOWNS|"
echo "$output"

if [[ -n "$DFILE" ]]; then
  # Write to a temporary name and rename, so readers never see a partial file.
  (
    umask 022
    echo "$output" > "$DFILE.tmp" && mv -- "$DFILE.tmp" "$DFILE"
  )
fi

FORCEROOFF=/var/run/postgresql/force-ro-off

if ((master_count < 1)); then
  echo "$date|FATAL|NOMASTER|NO MASTER FOUND|"
elif ((master_count > 1)); then
  echo "$date|FATAL|MULTIPLEMASTERS|TOO MANY MASTERS FOUND|"
  if [[ -n "$DOPGCTLSTOP" ]]; then
    # This host is a surplus master: create the flag file and stop PostgreSQL.
    touch "$FORCEROOFF"
    pg_ctl_stop
  fi
fi

# Remove the flag file once this host no longer needs to be stopped.
if [[ -z "$DOPGCTLSTOP" && -f "$FORCEROOFF" ]]; then
  rm -f "$FORCEROOFF"
fi

if ((up_count == 0)); then
  echo "$date|FATAL|NOREADER|NO SECONDARY FOUND"
fi

if ((down_count != 0)); then
  echo "$date|ERROR|DOWN|One or more systems are down: $DOWNS"
fi