From 3469bc69e5aef431576f8e7400b57740e52f967f Mon Sep 17 00:00:00 2001 From: Fiete Ostkamp Date: Thu, 8 Aug 2024 09:28:40 +0200 Subject: Improve readiness image to allow for shorter check intervals - introduce interval parameter that can be passed to define check interval - reduce default wait intervals from 5-11 seconds to 2-6 seconds - move checks to separate methods [0] [0] this prepares for executing the checks in parallel in a follow-up change, once this one is confirmed to work fine Issue-ID: INT-2284 Change-Id: Ie93360e700b3d1898bed51c0612e5430d7d502cc Signed-off-by: Fiete Ostkamp --- ready.py | 121 ++++++++++++++++++++++++++++++++++------------------- version.properties | 4 +- 2 files changed, 81 insertions(+), 44 deletions(-) diff --git a/ready.py b/ready.py index a8b1999..3544b1b 100755 --- a/ready.py +++ b/ready.py @@ -433,7 +433,7 @@ USAGE = "Usage: ready.py [-t ] [-n ] -c .. def main(argv): """ - Checks if a container, pod or service is ready, + Checks if a container, pod or service is ready, if a job is finished or if the main container of a job has completed. The check is done according to the name of the container op pod, not the name of its parent (Job, Deployment, StatefulSet, DaemonSet). 
@@ -452,8 +452,9 @@ def main(argv): timeout = DEF_TIMEOUT url = DEF_URL ns = "" + interval = None try: - opts, _args = getopt.getopt(argv, "hj:s:c:p:a:t:m:u:n:", ["service-name=", + opts, _args = getopt.getopt(argv, "hj:s:c:p:a:t:m:u:n:i:", ["service-name=", "container-name=", "pod-name=", "app-name=", @@ -461,7 +462,8 @@ def main(argv): "service-mesh-check=", "url=", "job-name=", - "namespace=" + "namespace=", + "interval=", "help"]) for opt, arg in opts: if opt in ("-h", "--help"): @@ -485,6 +487,8 @@ def main(argv): ns = arg elif opt in ("-t", "--timeout"): timeout = float(arg) + elif opt in ("-i", "--interval"): + interval = int(arg) except (getopt.GetoptError, ValueError) as exc: print("Error parsing input parameters: {}\n".format(exc)) print(USAGE) @@ -501,95 +505,128 @@ def main(argv): else: namespace = ns - for service_name in service_names: + check_service_readiness(service_names, timeout, interval) + check_container_readiness(container_names, timeout, interval) + check_pod_readiness(pod_names, timeout, interval) + check_app_readiness(app_names, timeout, interval) + check_job_readiness(job_names, timeout, interval) + check_service_mesh_job_readiness(service_mesh_job_container_names, timeout, url) + +def check_service_mesh_job_readiness(service_mesh_job_container_names, timeout, url): + for service_mesh_job_container_name in service_mesh_job_container_names: timeout = time.time() + timeout * 60 while True: - ready = is_service_ready(service_name) + ready = service_mesh_job_check(service_mesh_job_container_name) if ready is True: + sideCarKilled = quitquitquit_post(url) + if sideCarKilled is True: + log.info("Side Car Killed through QuitQuitQuit API") + else: + log.info("Side Car Failed to be Killed through QuitQuitQuit API") break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - service_name) + service_mesh_job_container_name) sys.exit(1) else: # spread in time potentially parallel execution in multiple # containers - 
time.sleep(random.randint(5, 11)) - for container_name in container_names: + time.sleep(random.randint(2, 6)) + +def check_job_readiness(job_names, timeout, interval=None): + for job_name in job_names: timeout = time.time() + timeout * 60 while True: - ready = is_ready(container_name) + ready = is_job_complete(job_name) if ready is True: break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - container_name) + job_name) sys.exit(1) else: - # spread in time potentially parallel execution in multiple - # containers - time.sleep(random.randint(5, 11)) - for pod_name in pod_names: + if interval is not None: + time.sleep(interval) + else: + # spread in time potentially parallel execution in multiple + # containers + time.sleep(random.randint(2, 6)) + +def check_app_readiness(app_names, timeout, interval=None): + for app_name in app_names: timeout = time.time() + timeout * 60 while True: - ready = is_pod_ready(pod_name) + ready = is_app_ready(app_name) if ready is True: break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - pod_name) + app_name) sys.exit(1) else: - # spread in time potentially parallel execution in multiple - # containers - time.sleep(random.randint(5, 11)) - for app_name in app_names: + if interval is not None: + time.sleep(interval) + else: + # spread in time potentially parallel execution in multiple + # containers + time.sleep(random.randint(2, 6)) + +def check_pod_readiness(pod_names, timeout, interval=None): + for pod_name in pod_names: timeout = time.time() + timeout * 60 while True: - ready = is_app_ready(app_name) + ready = is_pod_ready(pod_name) if ready is True: break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - app_name) + pod_name) sys.exit(1) else: - # spread in time potentially parallel execution in multiple - # containers - time.sleep(random.randint(5, 11)) - for job_name in job_names: + if interval is not None: + time.sleep(interval) + else: + # 
spread in time potentially parallel execution in multiple + # containers + time.sleep(random.randint(2, 6)) + +def check_container_readiness(container_names, timeout, interval=None): + for container_name in container_names: timeout = time.time() + timeout * 60 while True: - ready = is_job_complete(job_name) + ready = is_ready(container_name) if ready is True: break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - job_name) + container_name) sys.exit(1) else: - # spread in time potentially parallel execution in multiple - # containers - time.sleep(random.randint(5, 11)) - for service_mesh_job_container_name in service_mesh_job_container_names: + if interval is not None: + time.sleep(interval) + else: + # spread in time potentially parallel execution in multiple + # containers + time.sleep(random.randint(2, 6)) + +def check_service_readiness(service_names, timeout, interval=None): + for service_name in service_names: timeout = time.time() + timeout * 60 while True: - ready = service_mesh_job_check(service_mesh_job_container_name) + ready = is_service_ready(service_name) if ready is True: - sideCarKilled = quitquitquit_post(url) - if sideCarKilled is True: - log.info("Side Car Killed through QuitQuitQuit API") - else: - log.info("Side Car Failed to be Killed through QuitQuitQuit API") break if time.time() > timeout: log.warning("timed out waiting for '%s' to be ready", - service_mesh_job_container_name) + service_name) sys.exit(1) else: - # spread in time potentially parallel execution in multiple - # containers - time.sleep(random.randint(5, 11)) + if interval is not None: + time.sleep(interval) + else: + # spread in time potentially parallel execution in multiple + # containers + time.sleep(random.randint(2, 6)) if __name__ == "__main__": main(sys.argv[1:]) diff --git a/version.properties b/version.properties index 9603e41..01ff003 100644 --- a/version.properties +++ b/version.properties @@ -2,8 +2,8 @@ # Note that these variables cannot be 
structured (e.g. : version.release or version.snapshot etc... ) # because they are used in Jenkins, whose plug-in doesn't support -major=3 -minor=0 +major=6 +minor=1 patch=0 base_version=${major}.${minor}.${patch} -- cgit 1.2.3-korg