diff options
-rwxr-xr-x | tools/helm-healer.sh | 537 |
1 files changed, 537 insertions, 0 deletions
#!/bin/sh
#
# tools/helm-healer.sh - redeploy failed (or explicitly listed) ONAP helm
# components, optionally wiping their kubernetes resources and persistent
# storage first. Run with --help for the full usage text.
#
# The script is written for POSIX sh: no arrays, no 'local', no 'echo -e'.
# Function-scoped variables are prefixed with '_' by convention.
#

PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

#
# globals and defaults
#

NAMESPACE=
OVERRIDES=
HELM_CHART_RELEASE_NAME=
HELM_DELETE_ALL=
HELM_SKIP_DEPLOY=
VOLUME_STORAGE=
HELM_TIMEOUT=3600
RELEASE_PREFIX=onap

#
# control variables
#

CMD=$(basename "$0")
COLOR_ON_RED='\033[0;31;1m'
COLOR_ON_GREEN='\033[0;32;1m'
COLOR_OFF='\033[0m'


#
# functions
#

# Print the usage text to stdout.
# Backslashes at the end of the example command lines are written as '\\'
# because the here-document is unquoted: a single backslash-newline would be
# consumed as a line continuation and the examples would be joined together.
help()
{
cat <<EOF
${CMD} - simple tool for fixing onap helm deployment

DESCRIPTION
    This script does nothing smart or special it just tries to
    redeploy onap component. It can fix only problems related to
    race conditions or timeouts. Nothing else. It will not fix
    broken ONAP - there is no such ambition - that effort should
    be directed in the upstream.

USAGE
    ${CMD} -h|--help
        This help

    ${CMD} -n|--namespace <namespace>
           (-f|--file <override>)...
           (-s|--storage <directory>)|--no-storage-deletion
           [-p|--release-prefix <release prefix>]
           [-t|--timeout <secs>]
           [(-c|--component <component release name>)...|
            (-D|--delete-all)]
           [-C|--clean-only]

    Usage 1 (simple heuristics - redeploy failed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs

    Usage 2 (redeploy ONLY explicit listed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
               -c onap-aaf -c onap-sdc -c onap-portal

    Usage 3 (delete EVERYTHING and redeploy):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
               --delete-all

    Usage 4 (just clean - do not redeploy)
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
               --delete-all --clean-only

    Namespace argument and at least one override file are mandatory
    for this script to execute. Also you must provide path to the
    storage or explicitly request to not delete file storage of the
    component.

    Storage should be directory where persistent volume resides. It
    will work only if component created a persistent volume with the
    same filename as its release name. Otherwise no effect. The
    exception is when '--delete-all' is used - in that case all
    content of the storage is deleted (because ONAP is not consistent
    with the volume directory names - eg.: sdnc).

    CAUTION 1: filename of an override file cannot contain whitespace!
    This is actually helm/onap deploy plugin issue which does not
    handle such files. So I dropped the more complicated version of
    this script when there is no reason to support something on what
    will helm deploy choke anyway.

    '--prefix' option is helm release argument - it is actually prefix
    when you list the helm releases - helm is little confusing here.

    CAUTION 2: By default release prefix is 'onap' - if you deployed
    release 'onap' and now run this script with different prefix then
    it will skip all 'onap-*' components and will deploy a new release
    with new prefix - BEWARE TO USE PROPER RELEASE PREFIX!

    Timeout set the waiting time for helm deploy per component.

    '--component' references to release name of the chart which you
    want to redeploy excplicitly - otherwise 'ALL FAILED' components
    will be redeployed. You can target more than one component at once
    - just use the argument multiple times.

    Component option is mutually exclusive with the '--delete-all'
    which will delete all components - healthy or not. Actually it will
    delete the whole NAMESPACE and everything in it.

    '--clean-only' can be used with any usage: heuristics, explicit
    component list or with '--delete-all'. It basically just skips the
    last step - the actual redeploy.
EOF
}

# Print an informational message (green) to stdout.
# args: <message words>...
# printf is used instead of 'echo -e', which is not portable under /bin/sh;
# only the color variables are passed through %b so that backslashes in the
# message itself are printed verbatim.
msg()
{
    printf '%b%s%b\n' "${COLOR_ON_GREEN}" "INFO: $* " "${COLOR_OFF}"
}

# Print an error message (red) to stderr.
# args: <message words>...
error()
{
    printf '%b%s%b\n' "${COLOR_ON_RED}" "ERROR: $* " "${COLOR_OFF}" >&2
}

# Remove all successfully completed jobs in ${NAMESPACE}.
# A job counts as completed when the COMPLETIONS column ("done/desired")
# has both numbers equal.
clean_jobs()
{
    kubectl get jobs -n "${NAMESPACE}" \
        --ignore-not-found=true \
        --no-headers=true | \
    while read -r _job _completion _duration _age ; do
        # skip anything that does not look like "<number>/<number>"
        case "$_completion" in
            */*) ;;
            *) continue ;;
        esac
        _done=${_completion%%/*}
        _desired=${_completion#*/}
        case "${_done}${_desired}" in
            ''|*[!0-9]*) continue ;;
        esac
        if [ "$_desired" -eq "$_done" ] ; then
            delete_job "$_job"
        fi
    done
}

# Print the release names (one per line, unique) of all pods in phase Failed.
get_failed_labels()
{
    get_labels 'status.phase==Failed'
}

# Print the unique values of the 'release' label of pods in ${NAMESPACE}.
# arg: [optional: field selector]
get_labels()
{
    if [ -n "$1" ] ; then
        _selector="--field-selector=${1}"
    else
        _selector=
    fi

    # ${_selector:+...} drops the argument entirely when no selector is set
    kubectl get pods -n "${NAMESPACE}" \
        --show-labels=true \
        --include-uninitialized=true \
        ${_selector:+"${_selector}"} \
        --ignore-not-found=true \
        --no-headers=true | \
    while read -r _pod _ready _status _restart _age _labels ; do
        [ -z "$_labels" ] && break
        # labels come as a comma separated list: key=value,key=value,...
        for _label in $(printf '%s\n' "$_labels" | tr ',' ' ') ; do
            case "$_label" in
                release=*)
                    printf '%s\n' "${_label#release=}"
                    ;;
            esac
        done
    done | sort -u
}

# Undeploy (purge) a helm release.
# arg: <release name>
helm_undeploy()
{
    msg "Undeploy helm release name: ${1}"
    helm undeploy "${1}" --purge
}

# Delete a job and block until none of its pods are listed anymore.
# arg: <job name>
delete_job()
{
    kubectl delete job -n "${NAMESPACE}" \
        --cascade=true \
        --now=true \
        --include-uninitialized=true \
        --wait=true \
        "${1}"

    # wait for job to be deleted
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get pods -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true \
            --selector="job-name=${1}")
    done
}

# Delete every object of the given resource type that belongs to a release
# and block until each of them is gone.
# arg: <resource> <release name>
delete_resource()
{
    _resource="$1"
    _release="$2"

    msg "Delete ${_resource} for ${_release}..."
    {
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --selector="release=${_release}" \
            --no-headers=true

        # this is due to missing "release" label in some pods
        # grep for the rescue...
        # NOTE(review): this is a plain name-prefix match, so a release whose
        # name is a prefix of another release's name would match both.
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --no-headers=true | grep -- "^${_release}"
    } | awk '{print $1}' | sort -u | while read -r _name _rest ; do
        # never call 'kubectl delete' without an object name
        [ -n "$_name" ] || continue

        echo "Deleting '${_name}'"
        kubectl delete "${_resource}" -n "${NAMESPACE}" \
            --cascade=true \
            --now=true \
            --include-uninitialized=true \
            --wait=true \
            "${_name}" \
            2>&1 | grep -iv 'not[[:space:]]*found'

        # wait for resource to be deleted
        _output=start
        while [ -n "$_output" ] && sleep 1 ; do
            _output=$(kubectl get "${_resource}" -n "${NAMESPACE}" \
                --ignore-not-found=true \
                --no-headers=true \
                --field-selector="metadata.name=${_name}")
        done
    done
}

# Delete the whole ${NAMESPACE} and block until 'kubectl get all' on it
# returns nothing.
delete_namespace()
{
    msg "Delete the whole namespace: ${NAMESPACE}"
    kubectl delete namespace \
        --cascade=true \
        --now=true \
        --include-uninitialized=true \
        --wait=true \
        "$NAMESPACE"

    # wait for namespace to be deleted
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get all -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true)
    done
}

# Delete persistent volume data over ssh on the first worker node.
# With an argument only '${VOLUME_STORAGE}/<subdir>' is removed, without it
# everything directly under '${VOLUME_STORAGE}' is removed.
# arg: [optional: subdir]
delete_storage()
{
    # refuse to build a remote 'rm -rf' path from an empty storage root
    if [ -z "$VOLUME_STORAGE" ] ; then
        error "Storage directory is not set - SKIPPING DELETION"
        return 1
    fi

    # column 6 of 'kubectl get nodes -o wide' is used as the ssh target
    _node=$(kubectl get nodes \
        --selector=node-role.kubernetes.io/worker \
        -o wide \
        --no-headers=true | \
        awk '{print $6}' | head -n 1)

    if [ -z "$_node" ] ; then
        error "Could not list kubernetes nodes - SKIPPING DELETION"
    else
        if [ -n "$1" ] ; then
            msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
            ssh -T "$_node" <<EOF
rm -rf -- "${VOLUME_STORAGE}/${1}"
EOF
        else
            msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
            ssh -T "$_node" <<EOF
find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
EOF
        fi
    fi
}

# Undeploy one component and clean up its kubernetes resources and (when a
# storage directory was given) its persistent volume data. The actual
# redeploy is done later by 'helm deploy' of the whole release.
# arg: <release name>
redeploy_component()
{
    helm_undeploy "${1}"
    # TODO: does deleted secret per component break something?
    for x in jobs deployments pods pvc pv ; do
        delete_resource "${x}" "${1}"
    done

    if [ -n "$VOLUME_STORAGE" ] ; then
        msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
        delete_storage "$1"
    fi

    # TODO: until I can verify that this does the same for this component as helm deploy
    #_chart=$(echo "$1" | sed 's/[^-]*-//')
    #msg "Redeployment of the component ${1}..."
    #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
}


#
# arguments
#

# Simple state machine: 'nil' expects an option, any other state expects the
# value of the option that was seen last.
state=nil
arg_namespace=
arg_overrides=
arg_timeout=
arg_storage=
arg_nostorage=
arg_components=
arg_prefix=
arg_deleteall=
arg_cleanonly=
# iterate on the argument count so that an empty argument does not silently
# end the parsing and hide everything that follows it
while [ $# -gt 0 ] ; do
    case $state in
        nil)
            case "$1" in
                -h|--help)
                    help
                    exit 0
                    ;;
                -n|--namespace)
                    state=namespace
                    ;;
                -f|--file)
                    state=override
                    ;;
                -t|--timeout)
                    state=timeout
                    ;;
                -s|--storage)
                    state=storage
                    ;;
                --no-storage-deletion)
                    if [ -n "$arg_storage" ] ; then
                        error "Usage of storage argument together with no storage deletion option!"
                        exit 1
                    elif [ -z "$arg_nostorage" ] ; then
                        arg_nostorage=nostorage
                    else
                        error "Duplicit argument for no storage option! (IGNORING)"
                    fi
                    ;;
                -c|--component)
                    if [ -n "$arg_deleteall" ] ; then
                        error "'Delete all components' used already - argument mismatch"
                        exit 1
                    fi
                    state=component
                    ;;
                -D|--delete-all)
                    if [ -n "$arg_components" ] ; then
                        error "Explicit component(s) provided already - argument mismatch"
                        exit 1
                    elif [ -z "$arg_deleteall" ] ; then
                        arg_deleteall=deleteall
                    else
                        error "Duplicit argument for 'delete all' option! (IGNORING)"
                    fi
                    ;;
                # the usage text documents '--release-prefix', the original
                # parser knew only '--prefix' - accept both spellings
                -p|--prefix|--release-prefix)
                    state=prefix
                    ;;
                -C|--clean-only)
                    if [ -z "$arg_cleanonly" ] ; then
                        arg_cleanonly=cleanonly
                    else
                        error "Duplicit argument for 'clean only' option! (IGNORING)"
                    fi
                    ;;
                *)
                    error "Unknown parameter: $1"
                    exit 1
                    ;;
            esac
            ;;
        namespace)
            if [ -z "$arg_namespace" ] ; then
                arg_namespace="$1"
                state=nil
            else
                error "Duplicit argument for namespace!"
                exit 1
            fi
            ;;
        override)
            if ! [ -f "$1" ] ; then
                error "Wrong filename for override file: $1"
                exit 1
            fi
            # plain string on purpose: override filenames cannot contain
            # whitespace anyway (see CAUTION 1 in the help text)
            arg_overrides="${arg_overrides} -f $1"
            state=nil
            ;;
        component)
            arg_components="${arg_components} $1"
            state=nil
            ;;
        prefix)
            if [ -z "$arg_prefix" ] ; then
                arg_prefix="$1"
                state=nil
            else
                error "Duplicit argument for release prefix!"
                exit 1
            fi
            ;;
        timeout)
            if [ -z "$arg_timeout" ] ; then
                # portable "digits only" check (no GNU-only '\+' regex)
                case "$1" in
                    ''|*[!0-9]*)
                        error "Timeout must be an integer: $1"
                        exit 1
                        ;;
                esac
                arg_timeout="$1"
                state=nil
            else
                error "Duplicit argument for timeout!"
                exit 1
            fi
            ;;
        storage)
            if [ -n "$arg_nostorage" ] ; then
                error "Usage of storage argument together with no storage deletion option!"
                exit 1
            elif [ -z "$arg_storage" ] ; then
                arg_storage="$1"
                state=nil
            else
                error "Duplicit argument for storage!"
                exit 1
            fi
            ;;
    esac
    shift
done

# an option that expects a value was the very last argument
if [ "$state" != nil ] ; then
    error "Missing value for option: ${state}"
    exit 1
fi

# sanity check
if [ -z "$arg_namespace" ] ; then
    error "Missing namespace"
    help
    exit 1
else
    NAMESPACE="$arg_namespace"
fi

if [ -z "$arg_overrides" ] ; then
    error "Missing override file(s)"
    help
    exit 1
else
    OVERRIDES="$arg_overrides"
fi

if [ -n "$arg_prefix" ] ; then
    RELEASE_PREFIX="$arg_prefix"
fi

if [ -n "$arg_timeout" ] ; then
    HELM_TIMEOUT="$arg_timeout"
fi

if [ -n "$arg_storage" ] ; then
    VOLUME_STORAGE="$arg_storage"
elif [ -z "$arg_nostorage" ] ; then
    error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
    exit 1
fi

if [ -n "$arg_components" ] ; then
    HELM_CHART_RELEASE_NAME="$arg_components"
fi

if [ -n "$arg_deleteall" ] ; then
    HELM_DELETE_ALL=yes
fi

if [ -n "$arg_cleanonly" ] ; then
    HELM_SKIP_DEPLOY=yes
fi


#
# main
#

# if --delete-all is used then redeploy all components (the current namespace is deleted)
if [ -n "$HELM_DELETE_ALL" ] ; then
    # undeploy helm release (prefix)
    helm_undeploy "$RELEASE_PREFIX"

    # we will delete the whole namespace
    delete_namespace

    if [ -n "$VOLUME_STORAGE" ] ; then
        delete_storage
    fi
# delete and redeploy explicit or failed components...
else
    # if a helm chart release name was given then just redeploy said component and quit
    if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
        msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
        _COMPONENTS="$HELM_CHART_RELEASE_NAME"
    # simple heuristics: redeploy only failed components
    else
        msg "Delete successfully completed jobs..."
        clean_jobs

        msg "Find failed components..."
        _COMPONENTS=$(get_failed_labels)
    fi

    # _COMPONENTS is a whitespace separated list - splitting is intended
    for _component in ${_COMPONENTS} ; do
        case "$_component" in
            "${RELEASE_PREFIX}-"*)
                msg "Redeploy component: ${_component}"
                redeploy_component "${_component}"
                ;;
            *)
                error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
                ;;
        esac
    done
fi

# exit status of the script: the status of 'helm deploy' when it was run
_rc=0
if [ -z "$HELM_SKIP_DEPLOY" ] ; then
    # TODO: this is suboptimal - find a way how to deploy only the affected component...
    msg "Redeploy onap..."
    # OVERRIDES holds several '-f <file>' pairs and must be word-split
    # shellcheck disable=SC2086
    msg helm deploy "${RELEASE_PREFIX}" local/onap --namespace "${NAMESPACE}" ${OVERRIDES} --timeout "${HELM_TIMEOUT}"
    # shellcheck disable=SC2086
    helm deploy "${RELEASE_PREFIX}" local/onap --namespace "${NAMESPACE}" ${OVERRIDES} --timeout "${HELM_TIMEOUT}" || _rc=$?
else
    msg "Clean only option used: Skipping redeploy..."
fi

if [ "$_rc" -ne 0 ] ; then
    error "helm deploy failed with exit code: ${_rc}"
else
    msg DONE
fi

exit "$_rc"