summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Mateusz Pilat <m.pilat@partner.samsung.com> 2020-07-09 15:31:58 +0200
committer Mateusz Pilat <m.pilat@partner.samsung.com> 2020-07-13 11:36:21 +0200
commit b3babc7d1c7c6d2a312df79a79c5a09faf79da26 (patch)
tree 34686e63891755ba2853578b82bc3003c822af54
parent 8370ac160a97b856a994c4a0b8a178675df195a1 (diff)
Refactor helm-healer script
Extending helm-healer.sh functionalities: - detecting orphaned resources - removing persistent storage for volumes. Issue-ID: OOM-2074 Signed-off-by: Mateusz Pilat <m.pilat@partner.samsung.com> Change-Id: Ia7fd21ab61bfedc39647a30528a122a49a8fd79f
-rwxr-xr-x tools/helm-healer.sh 190
1 files changed, 122 insertions, 68 deletions
diff --git a/tools/helm-healer.sh b/tools/helm-healer.sh
index a6e5b398..92ddbdb7 100755
--- a/tools/helm-healer.sh
+++ b/tools/helm-healer.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
@@ -191,6 +191,7 @@ helm_undeploy()
{
msg "Undeploy helm release name: ${1}"
helm undeploy ${1} --purge
+ sleep 15s
}
# arg: <job name>
@@ -212,41 +213,64 @@ delete_job()
done
}
-# arg: <resource> <release name>
-delete_resource()
+#arg: <component>
+get_resources_for_component()
{
- _resource="$1"
- _release="$2"
- msg "Delete ${_resource} for ${_release}..."
- {
- kubectl get ${_resource} -n ${NAMESPACE} \
- --ignore-not-found=true \
- --selector="release=${_release}" \
- --no-headers=true
-
- # this is due to missing "release" label in some pods
- # grep for the rescue...
- kubectl get ${_resource} -n ${NAMESPACE} \
- --no-headers=true | grep "^${_release}[-]"
- } | awk '{print $1}' | sort -u | while read -r _name _rest ; do
- echo "Deleting '${_name}'"
- kubectl delete ${_resource} -n ${NAMESPACE} \
- --cascade=true \
- --now=true \
- --wait=true \
- ${_name} \
- 2>&1 | grep -iv 'not[[:space:]]*found'
-
- # wait for resource to be deleted
- _output=start
- while [ -n "$_output" ] && sleep 1 ; do
- _output=$(kubectl get ${_resource} -n ${NAMESPACE} \
- --ignore-not-found=true \
- --no-headers=true \
- --field-selector="metadata.name=${_name}")
- done
+helm status $1 | awk -f <(cat - <<-'EOD'
+BEGIN {
+ work="no"
+ kind=""
+ a["dummy"]=""
+}
+
+$1 ~ ":" {
+ if ( $1 == "RESOURCES:" ) {
+ work="yes"
+} else {
+ work="no"
+}
+
+}
+
+$1 == "==>" {
+ split($2, a, "[/(]")
+ kind=a[2]
+}
+
+$1 != "NAME" && $1 != "==>" && work == "yes" && $1 !~ ":" && $1 != "" {
+ printf "%s/%s\n", kind, $1
+}
+
+EOD
+)
+}
+
+# arg: <resource>
+delete_resource()
+{
+ local _resource="$1"
+ local _kind="${_resource%/*}"
+ local _name="${_resource#*/}"
+
+
+ if kubectl get ${_resource} >/dev/null 2>&1; then
+ msg "${_resource} has not been removed with helm undeploy, manual removal is required. Proceeding"
+ kubectl delete ${_resource} -n ${NAMESPACE} \
+ --cascade=true \
+ --now=true \
+ --wait=true \
+ 2>&1 | grep -iv 'not[[:space:]]*found'
+
+ # wait for resource to be deleted
+ _output=start
+ while [ -n "$_output" ] && sleep 1 ; do
+ _output=$(kubectl get ${_kind} ${_name} -n ${NAMESPACE} \
+ --ignore-not-found=true \
+ --no-headers=true )
done
+ msg "Done"
+ fi
}
delete_namespace()
@@ -267,7 +291,18 @@ delete_namespace()
done
}
-# arg: [optional: subdir]
+delete_persistent_volume()
+{
+ _persistent_volume=$1
+ if kubectl get ${_persistent_volume} >/dev/null 2>&1; then
+ msg "${_persistent_volume} has not been removed with helm undeploy, manual removal is required. Proceeding"
+ #very often k8s hangs on Terminating state for pv due to still active pvc. It is better to delete pvc directly
+ _claim=$(kubectl get ${_persistent_volume} -o jsonpath='{ .spec.claimRef.name}')
+ delete_resource PersistentVolumeClaim/${_claim}
+ fi
+}
+
+# arg: [optional: directory]
delete_storage()
{
_node=$(kubectl get nodes \
@@ -280,15 +315,11 @@ delete_storage()
error "Could not list kubernetes nodes - SKIPPING DELETION"
else
if [ -n "$1" ] ; then
- msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
- ssh -T $_node <<EOF
-rm -rf "${VOLUME_STORAGE}/${1}"
-EOF
+ msg "Delete directory '${1}' on $_node"
+ ssh $_node "rm -rf '${1}'"
else
msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
- ssh -T $_node <<EOF
-find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
-EOF
+ ssh $_node "find '${VOLUME_STORAGE}' -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;"
fi
fi
}
@@ -308,13 +339,7 @@ docker_cleanup()
for _node in $_nodes ; do
msg "Docker cleanup on $_node"
- {
- ssh -T $_node >/dev/null <<EOF
-if which docker >/dev/null ; then
- docker system prune --force --all --volumes
-fi
-EOF
- } &
+ ssh $_node "docker system prune --force --all --volumes" >/dev/null &
done
msg "We are waiting now for docker cleanup to finish on all nodes..."
@@ -338,28 +363,57 @@ is_helm_serve_running()
# arg: <release name>
undeploy_component()
{
- _chart=$(echo "$1" | sed 's/[^-]*-//')
- helm_undeploy ${1}
-
- # for all kubernetes resources: kubectl api-resources
- # TODO: does deleted secret per component break something?
- for x in jobs \
- deployments \
- services \
- replicasets \
- statefulsets \
- daemonsets \
- pods \
- pvc \
- pv \
- ;
- do
- delete_resource ${x} ${1}
+ local _component=$1
+
+ #Because Helm undeploy is not reliable: Gathering resources assigned to component to track and remove orphans later
+ _component_resources=($(get_resources_for_component ${_component}))
+
+ declare -a _persistent_volumes
+ declare -a _standard
+ declare -a _unknown_kinds
+
+ for resource in ${_component_resources[@]}; do
+ case $resource in
+ CronJob/* | Job/* | Secret/* | ConfigMap/* | Pod/* | Service/* | Deployment/* | StatefulSet/*)
+ _standard+=(${resource});;
+ #Ignoring PVC, they will be handled along with PV as 'helm' status does not return them for some components
+ PersistentVolumeClaim/*)
+ ;;
+ PersistentVolume/*)
+ _persistent_volumes+=(${resource});;
+ *)
+ _unknown_kinds+=(${resource})
+ esac
done
- if [ -n "$VOLUME_STORAGE" ] ; then
- msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
- delete_storage "$1"
+
+ #Gathering physical location of directories for persistent volumes to delete them after undeploy
+ declare -a _physical_locations
+ for volume in ${_persistent_volumes[@]}; do
+ _physical_locations+=($(kubectl get ${volume} -o jsonpath='{ .spec.hostPath.path}' ))
+ done
+
+ helm_undeploy ${_component}
+
+ #Manual items removal
+ for resource in ${_standard[@]}; do
+ delete_resource ${resource}
+ done
+
+ for volume in ${_persistent_volumes[@]}; do
+ delete_persistent_volume ${volume}
+ done
+
+ for subdir in ${_physical_locations[@]}; do
+ delete_storage ${subdir}
+ done
+
+ if [ "${#_unknown_kinds[@]}" -ne 0 ] ; then
+ for resource in ${_unknown_kinds[@]}; do
+ error "Untracked resource kind present: ${resource}, attempting to delete it..."
+ delete_resource ${resource}
+ done
+ return
fi
}