From d1c92efe71f29f4e19bc4c7080094f5e1c447879 Mon Sep 17 00:00:00 2001 From: Mateusz Pilat Date: Thu, 9 Jul 2020 15:31:58 +0200 Subject: Refactor helm-healer script Extending helm-healer.sh functionalities: -detecting orphaned resources -removing persistent storage for volumes Issue-ID: OOM-2074 Signed-off-by: Mateusz Pilat Change-Id: Ia7fd21ab61bfedc39647a30528a122a49a8fd79f (cherry picked from commit b3babc7d1c7c6d2a312df79a79c5a09faf79da26) --- tools/helm-healer.sh | 190 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 122 insertions(+), 68 deletions(-) diff --git a/tools/helm-healer.sh b/tools/helm-healer.sh index a6e5b398..92ddbdb7 100755 --- a/tools/helm-healer.sh +++ b/tools/helm-healer.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin @@ -191,6 +191,7 @@ helm_undeploy() { msg "Undeploy helm release name: ${1}" helm undeploy ${1} --purge + sleep 15s } # arg: @@ -212,41 +213,64 @@ delete_job() done } -# arg: -delete_resource() +#arg: +get_resources_for_component() { - _resource="$1" - _release="$2" - msg "Delete ${_resource} for ${_release}..." - { - kubectl get ${_resource} -n ${NAMESPACE} \ - --ignore-not-found=true \ - --selector="release=${_release}" \ - --no-headers=true - - # this is due to missing "release" label in some pods - # grep for the rescue... 
- kubectl get ${_resource} -n ${NAMESPACE} \ - --no-headers=true | grep "^${_release}[-]" - } | awk '{print $1}' | sort -u | while read -r _name _rest ; do - echo "Deleting '${_name}'" - kubectl delete ${_resource} -n ${NAMESPACE} \ - --cascade=true \ - --now=true \ - --wait=true \ - ${_name} \ - 2>&1 | grep -iv 'not[[:space:]]*found' - - # wait for resource to be deleted - _output=start - while [ -n "$_output" ] && sleep 1 ; do - _output=$(kubectl get ${_resource} -n ${NAMESPACE} \ - --ignore-not-found=true \ - --no-headers=true \ - --field-selector="metadata.name=${_name}") - done +helm status $1 | awk -f <(cat - <<-'EOD' +BEGIN { + work="no" + kind="" + a["dummy"]="" +} + +$1 ~ ":" { + if ( $1 == "RESOURCES:" ) { + work="yes" +} else { + work="no" +} + +} + +$1 == "==>" { + split($2, a, "[/(]") + kind=a[2] +} + +$1 != "NAME" && $1 != "==>" && work == "yes" && $1 !~ ":" && $1 != "" { + printf "%s/%s\n", kind, $1 +} + +EOD +) +} + +# arg: +delete_resource() +{ + local _resource="$1" + local _kind="${_resource%/*}" + local _name="${_resource#*/}" + + + if kubectl get ${_resource} >/dev/null 2>&1; then + msg "${_resource} has not been removed with helm undeploy, manual removal is required. Proceeding" + kubectl delete ${_resource} -n ${NAMESPACE} \ + --cascade=true \ + --now=true \ + --wait=true \ + 2>&1 | grep -iv 'not[[:space:]]*found' + + # wait for resource to be deleted + _output=start + while [ -n "$_output" ] && sleep 1 ; do + _output=$(kubectl get ${_kind} ${_name} -n ${NAMESPACE} \ + --ignore-not-found=true \ + --no-headers=true ) done + msg "Done" + fi } delete_namespace() @@ -267,7 +291,18 @@ delete_namespace() done } -# arg: [optional: subdir] +delete_persistent_volume() +{ + _persistent_volume=$1 + if kubectl get ${_persistent_volume} >/dev/null 2>&1; then + msg "${_persistent_volume} has not been removed with helm undeploy, manual removal is required. Proceeding" + #very often k8s hangs on Terminating state for pv due to still active pvc. 
It is better to delete pvc directly + _claim=$(kubectl get ${_persistent_volume} -o jsonpath='{ .spec.claimRef.name}') + delete_resource PersistentVolumeClaim/${_claim} + fi +} + +# arg: [optional: directory] delete_storage() { _node=$(kubectl get nodes \ @@ -280,15 +315,11 @@ delete_storage() error "Could not list kubernetes nodes - SKIPPING DELETION" else if [ -n "$1" ] ; then - msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node" - ssh -T $_node </dev/null </dev/null ; then - docker system prune --force --all --volumes -fi -EOF - } & + ssh $_node "docker system prune --force --all --volumes" >/dev/null & done msg "We are waiting now for docker cleanup to finish on all nodes..." @@ -338,28 +363,57 @@ is_helm_serve_running() # arg: undeploy_component() { - _chart=$(echo "$1" | sed 's/[^-]*-//') - helm_undeploy ${1} - - # for all kubernetes resources: kubectl api-resources - # TODO: does deleted secret per component break something? - for x in jobs \ - deployments \ - services \ - replicasets \ - statefulsets \ - daemonsets \ - pods \ - pvc \ - pv \ - ; - do - delete_resource ${x} ${1} + local _component=$1 + + #Because Helm undeploy is not reliable: Gathering resources assigned to componen to track and remove orphans later + _component_resources=($(get_resources_for_component ${_component})) + + declare -a _persistent_volumes + declare -a _standard + declare -a _unknown_kinds + + for resource in ${_component_resources[@]}; do + case $resource in + CronJob/* | Job/* | Secret/* | ConfigMap/* | Pod/* | Service/* | Deployment/* | StatefulSet/*) + _standard+=(${resource});; + #Ignoring PVC, they will be handled along with PV as 'helm' status does not return them for some components + PersistentVolumeClaim/*) + ;; + PersistentVolume/*) + _persistent_volumes+=(${resource});; + *) + _unknown_kinds+=(${resource}) + esac done - if [ -n "$VOLUME_STORAGE" ] ; then - msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}" - delete_storage "$1" + + 
#Gathering physical location of directories for persistent volumes to delete them after undeploy + declare -a _physical_locations + for volume in ${_persistent_volumes[@]}; do + _physical_locations+=($(kubectl get ${volume} -o jsonpath='{ .spec.hostPath.path}' )) + done + + helm_undeploy ${_component} + + #Manual items removal + for resource in ${_standard[@]}; do + delete_resource ${resource} + done + + for volume in ${_persistent_volumes[@]}; do + delete_persistent_volume ${volume} + done + + for subdir in ${_physical_locations[@]}; do + delete_storage ${subdir} + done + + if [ "${#_unknown_kinds[@]}" -ne 0 ] ; then + for resource in ${_unknown_kinds[@]}; do + error "Untracked resource kind present: ${resource}, attempting to delete it..." + delete_resource ${resource} + done + return fi } -- cgit 1.2.3-korg