diff options
author | Alex Shatov <alexs@att.com> | 2018-04-27 11:53:55 -0400 |
---|---|---|
committer | Alex Shatov <alexs@att.com> | 2018-04-27 11:53:55 -0400 |
commit | 50bed534083c96cbf1f8fa4e220cb2b00dff9621 (patch) | |
tree | 8abb64c6e6bbbf7a7a2f7d9bf12cfb9b4e166f2e /policyhandler | |
parent | 3365431059e2de5977dae447f34a2d42dd2b039b (diff) |
2.4.2 policy-handler - fixed race on step-timer
- fixed the bug of unpredictably stopping of the periodic catch-up
step-timer due to thread race condition in policy-handler
= added critical sections under the reentrant lock on every group
of local var change in step-timer
- added more stats for healthcheck to track each type of
job-operation separately
= that helps narrowing down identifying the potential problems
- unit test coverage 76%
Change-Id: I92ddf6c92a3d225d9b87427e3edfb7f80669501a
Signed-off-by: Alex Shatov <alexs@att.com>
Issue-ID: DCAEGEN2-472
Diffstat (limited to 'policyhandler')
-rw-r--r-- | policyhandler/onap/audit.py | 22 | ||||
-rw-r--r-- | policyhandler/policy_receiver.py | 5 | ||||
-rw-r--r-- | policyhandler/policy_rest.py | 17 | ||||
-rw-r--r-- | policyhandler/policy_updater.py | 12 | ||||
-rw-r--r-- | policyhandler/step_timer.py | 121 | ||||
-rw-r--r-- | policyhandler/web_server.py | 16 |
6 files changed, 118 insertions, 75 deletions
diff --git a/policyhandler/onap/audit.py b/policyhandler/onap/audit.py index c615d63..08dcd37 100644 --- a/policyhandler/onap/audit.py +++ b/policyhandler/onap/audit.py @@ -28,6 +28,7 @@ import copy import json import os +import re import subprocess import sys import time @@ -48,6 +49,8 @@ AUDIT_REQUESTID = 'requestID' AUDIT_IPADDRESS = 'IPAddress' AUDIT_SERVER = 'server' AUDIT_TARGET_ENTITY = 'targetEntity' +AUDIT_METRICS = 'metrics' +AUDIT_TOTAL_STATS = 'audit_total_stats' HEADER_CLIENTAUTH = "clientauth" HEADER_AUTHORIZATION = "authorization" @@ -119,6 +122,7 @@ class Audit(object): _service_version = "" _service_instance_uuid = str(uuid.uuid4()) _started = datetime.now() + _key_format = re.compile(r"\W") _logger_debug = None _logger_error = None _logger_metrics = None @@ -160,14 +164,16 @@ class Audit(object): "packages" : Audit._packages } - def __init__(self, request_id=None, req_message=None, aud_parent=None, **kwargs): + def __init__(self, job_name=None, request_id=None, req_message=None, aud_parent=None, **kwargs): """create audit object per each request in the system + :job_name: is the name of the audit job for health stats :request_id: is the X-ECOMP-RequestID for tracing :req_message: is the request message string for logging :aud_parent: is the parent Audit - used for sub-query metrics to other systems :kwargs: - put any request related params into kwargs """ + self.job_name = Audit._key_format.sub('_', job_name or req_message or Audit._service_name) self.request_id = request_id self.req_message = req_message or "" self.aud_parent = aud_parent @@ -178,6 +184,8 @@ class Audit(object): self._lock = Lock() if self.aud_parent: + self.job_name = Audit._key_format.sub( + '_', job_name or self.aud_parent.job_name or Audit._service_name) if not self.request_id: self.request_id = self.aud_parent.request_id if not self.req_message: @@ -330,17 +338,19 @@ class Audit(object): self._get_response_status() metrics_func = None timer = Audit.get_elapsed_time(self._metrics_started) + metrics_job = Audit._key_format.sub( + '_', all_kwargs.get(AUDIT_TARGET_ENTITY, AUDIT_METRICS + "_" + self.job_name)) if success: log_line = "done: {0}".format(log_line) self.info(log_line, **all_kwargs) metrics_func = Audit._logger_metrics.info - Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer) + Audit._health.success(metrics_job, timer) else: log_line = "failed: {0}".format(log_line) self.error(log_line, errorCode=response_code.value, \ errorDescription=response_description, **all_kwargs) metrics_func = Audit._logger_metrics.error - Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer) + Audit._health.error(metrics_job, timer) metrics_func(log_line, begTime=self._metrics_start_event, timer=timer, statusCode=Audit.get_status_code(success), responseCode=response_code.value, @@ -363,13 +373,15 @@ class Audit(object): log_line = "done: {0}".format(log_line) self.info(log_line, **all_kwargs) audit_func = Audit._logger_audit.info - Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer) + Audit._health.success(self.job_name, timer) + Audit._health.success(AUDIT_TOTAL_STATS, timer) else: log_line = "failed: {0}".format(log_line) self.error(log_line, errorCode=response_code.value, errorDescription=response_description, **all_kwargs) audit_func = Audit._logger_audit.error - Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer) + Audit._health.error(self.job_name, timer) + Audit._health.error(AUDIT_TOTAL_STATS, timer) audit_func(log_line, begTime=self._start_event, timer=timer, statusCode=Audit.get_status_code(success), diff --git a/policyhandler/policy_receiver.py b/policyhandler/policy_receiver.py index 843dbd8..751bea8 100644 --- a/policyhandler/policy_receiver.py +++ b/policyhandler/policy_receiver.py @@ -136,8 +136,9 @@ class _PolicyReceiver(Thread): ) return - audit = Audit(req_message="policy-notification - updated[{0}], removed[{1}]" \ - .format(len(policies_updated), len(policies_removed))) + audit = Audit(job_name="policy_update", + req_message="policy-notification - updated[{0}], removed[{1}]" + .format(len(policies_updated), len(policies_removed))) audit.retry_get_config = True self._policy_updater.enqueue(audit, policies_updated, policies_removed) diff --git a/policyhandler/policy_rest.py b/policyhandler/policy_rest.py index 4eee154..0c8920a 100644 --- a/policyhandler/policy_rest.py +++ b/policyhandler/policy_rest.py @@ -406,18 +406,26 @@ class PolicyRest(object): result = {} aud_policy_filters = None + str_policy_filters = None str_metrics = None - str_policy_filters = json.dumps(policy_filter or PolicyRest._scope_prefixes) + target_entity = None + if policy_filter is not None: aud_policy_filters = [(audit, policy_filter, None)] + str_policy_filters = json.dumps(policy_filter) str_metrics = "get_latest_policies for policy_filter {0}".format( str_policy_filters) + target_entity = ("{0} total get_latest_policies by policy_filter" + .format(PolicyRest._target_entity)) result[POLICY_FILTER] = copy.deepcopy(policy_filter) else: aud_policy_filters = [(audit, {POLICY_NAME:scope_prefix + ".*"}, scope_prefix) for scope_prefix in PolicyRest._scope_prefixes] + str_policy_filters = json.dumps(PolicyRest._scope_prefixes) str_metrics = "get_latest_policies for scopes {0} {1}".format( \ len(PolicyRest._scope_prefixes), str_policy_filters) + target_entity = ("{0} total get_latest_policies by scope_prefixes" + .format(PolicyRest._target_entity)) result[SCOPE_PREFIXES] = copy.deepcopy(PolicyRest._scope_prefixes) PolicyRest._logger.debug("%s", str_policy_filters) @@ -433,9 +441,10 @@ class PolicyRest(object): pool.close() pool.join() - audit.metrics("total result {0}: {1} {2}".format( - str_metrics, len(latest_policies), json.dumps(latest_policies)), \ - targetEntity=PolicyRest._target_entity, targetServiceName=PolicyRest._url_get_config) + audit.metrics( + "total result {0}: {1} {2}".format( + str_metrics, len(latest_policies), json.dumps(latest_policies)), + targetEntity=target_entity, targetServiceName=PolicyRest._url_get_config) # latest_policies == [(valid_policies, errored_policies, errored_scope_prefix), ...] result[LATEST_POLICIES] = dict( diff --git a/policyhandler/policy_updater.py b/policyhandler/policy_updater.py index 0b9e227..70823fa 100644 --- a/policyhandler/policy_updater.py +++ b/policyhandler/policy_updater.py @@ -47,7 +47,7 @@ class PolicyUpdater(Thread): self._aud_shutdown = None self._aud_catch_up = None - catch_up_config = Config.config.get("catch_up", {}) + catch_up_config = Config.config.get(CATCH_UP, {}) self._catch_up_interval = catch_up_config.get("interval") or 15*60 self._catch_up_max_skips = catch_up_config.get("max_skips") or 3 self._catch_up_skips = 0 @@ -120,8 +120,10 @@ class PolicyUpdater(Thread): """need to bring the latest policies to DCAE-Controller""" with self._lock: self._aud_catch_up = audit or Audit(req_message=AUTO_CATCH_UP) - PolicyUpdater._logger.info("catch_up %s request_id %s", - self._aud_catch_up.req_message, self._aud_catch_up.request_id) + PolicyUpdater._logger.info( + "catch_up %s request_id %s", + self._aud_catch_up.req_message, self._aud_catch_up.request_id + ) self.enqueue() @@ -131,8 +133,8 @@ class PolicyUpdater(Thread): return if self._catch_up_timer: - self._catch_up_timer.next() self._logger.info("next step catch_up_timer in %s", self._catch_up_interval) + self._catch_up_timer.next() return self._catch_up_timer = StepTimer( @@ -142,8 +144,8 @@ class PolicyUpdater(Thread): PolicyUpdater._logger, self ) - self._catch_up_timer.start() self._logger.info("started catch_up_timer in %s", self._catch_up_interval) + self._catch_up_timer.start() def _pause_catch_up_timer(self): """pause catch_up_timer""" diff --git a/policyhandler/step_timer.py b/policyhandler/step_timer.py index ad77c85..3936107 100644 --- a/policyhandler/step_timer.py +++ b/policyhandler/step_timer.py @@ -19,7 +19,7 @@ """periodically callback""" from datetime import datetime -from threading import Event, Lock, Thread +from threading import Event, RLock, Thread class StepTimer(Thread): @@ -40,67 +40,75 @@ class StepTimer(Thread): self._args = args self._kwargs = kwargs - self._lock = Lock() + self._lock = RLock() self._timeout = Event() - self._paused = Event() + self._waiting_for_timeout = False self._next = Event() - self._finished = Event() + self._paused = False + self._finished = False - self._event = StepTimer.INIT - self._event_counter = 0 - self._event_time = 0 - self._event_ts = datetime.now() + self._request = StepTimer.INIT + self._req_count = 0 + self._req_time = 0 + self._req_ts = datetime.now() self._substep = None self._substep_time = 0 self._substep_ts = datetime.now() - def get_status(self): - """returns status of events""" + def get_timer_status(self): + """returns timer status""" with self._lock: return "{0}[{1}] {2}: timeout({3}), paused({4}), next({5}), finished({6})".format( - self._event, - self._event_counter, + self._request, + self._req_count, self._substep, self._timeout.is_set(), - self._paused.is_set(), + self._paused, self._next.is_set(), - self._finished.is_set(), + self._finished, ) def next(self): """continue with the next timeout""" - self._paused.clear() - self._next.set() - self._timeout.set() - self._set_timer_event(StepTimer.NEXT) + with self._lock: + self._paused = False + if self._waiting_for_timeout: + self._next.set() + self._timeout.set() + else: + self._next.set() + self._request_to_timer(StepTimer.NEXT) def pause(self): """pause the timer""" - self._paused.set() - self._next.clear() - self._set_timer_event(StepTimer.PAUSED) + with self._lock: + self._paused = True + self._next.clear() + self._request_to_timer(StepTimer.PAUSED) def stop(self): """stop the timer if it hasn't finished yet""" - self._finished.set() - self._timeout.set() - self._next.set() - self._set_timer_event(StepTimer.STOPPING) + with self._lock: + self._finished = True + self._timeout.set() + self._next.set() + self._request_to_timer(StepTimer.STOPPING) - def _set_timer_event(self, event): - """set the event on the timer""" + def _request_to_timer(self, request): + """set the request on the timer""" with self._lock: - if event in [StepTimer.NEXT, StepTimer.STARTED]: - self._event_counter += 1 + if request in [StepTimer.NEXT, StepTimer.STARTED]: + self._req_count += 1 - self._event = event + prev_req = self._request + self._request = request now = datetime.now() - self._event_time = (now - self._event_ts).total_seconds() - self._event_ts = now - self._logger.info("[{0}] {1} {2}".format( - self._event_time, self.name, self.get_status())) + self._req_time = (now - self._req_ts).total_seconds() + self._req_ts = now + self._logger.info("{0}[{1}] {2}->{3}".format( + self.name, self._req_time, prev_req, self.get_timer_status())) def _timer_substep(self, substep): """log exe step""" @@ -109,27 +117,34 @@ class StepTimer(Thread): now = datetime.now() self._substep_time = (now - self._substep_ts).total_seconds() self._substep_ts = now - self._logger.info("[{0}] {1}".format(self._substep_time, self.get_status())) + self._logger.info("[{0}] {1}".format(self._substep_time, self.get_timer_status())) def run(self): """loop one step a time until stopped=finished""" - self._set_timer_event(StepTimer.STARTED) + self._request_to_timer(StepTimer.STARTED) while True: - self._timer_substep("waiting for timeout {0}...".format(self._interval)) - self._timeout.wait(self._interval) - self._timer_substep("woke up after timeout") + with self._lock: + self._timeout.clear() + self._waiting_for_timeout = True + self._timer_substep("waiting for timeout {0}...".format(self._interval)) - if self._finished.is_set(): - self._timer_substep("finished") - break + interrupted = self._timeout.wait(self._interval) - if self._next.is_set(): - self._next.clear() - self._timeout.clear() - self._timer_substep("restart timer") - continue + with self._lock: + self._waiting_for_timeout = False + self._timer_substep("woke up after {0}timeout" + .format((interrupted and "interrupted ") or "")) + + if self._finished: + self._timer_substep("finished") + break - if self._paused.is_set(): + if self._next.is_set() and interrupted: + self._next.clear() + self._timer_substep("restart timer") + continue + + if self._paused: self._timer_substep("paused - skip on_time event") else: self._timer_substep("on_time event") @@ -137,12 +152,12 @@ class StepTimer(Thread): self._timer_substep("waiting for next...") self._next.wait() - self._next.clear() - self._timeout.clear() - self._timer_substep("woke up on next") + with self._lock: + self._next.clear() + self._timer_substep("woke up on next") - if self._finished.is_set(): + if self._finished: self._timer_substep("finished") break - self._set_timer_event(StepTimer.STOPPED) + self._request_to_timer(StepTimer.STOPPED) diff --git a/policyhandler/web_server.py b/policyhandler/web_server.py index 20e74e3..e9cc9cc 100644 --- a/policyhandler/web_server.py +++ b/policyhandler/web_server.py @@ -56,7 +56,8 @@ class _PolicyWeb(object): def policy_latest(self, policy_id): """retireves the latest policy identified by policy_id""" req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message=req_info, headers=cherrypy.request.headers) + audit = Audit(job_name="get_latest_policy", + req_message=req_info, headers=cherrypy.request.headers) PolicyWeb.logger.info("%s policy_id=%s headers=%s", \ req_info, policy_id, json.dumps(cherrypy.request.headers)) @@ -74,7 +75,8 @@ class _PolicyWeb(object): def _get_all_policies_latest(self): """retireves all the latest policies on GET /policies_latest""" req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message=req_info, headers=cherrypy.request.headers) + audit = Audit(job_name="get_all_policies_latest", + req_message=req_info, headers=cherrypy.request.headers) PolicyWeb.logger.info("%s", req_info) @@ -146,7 +148,8 @@ class _PolicyWeb(object): str_policy_filter = json.dumps(policy_filter) req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message="{0}: {1}".format(req_info, str_policy_filter), \ + audit = Audit(job_name="get_latest_policies", + req_message="{0}: {1}".format(req_info, str_policy_filter), \ headers=cherrypy.request.headers) PolicyWeb.logger.info("%s: policy_filter=%s headers=%s", \ req_info, str_policy_filter, json.dumps(cherrypy.request.headers)) @@ -168,7 +171,7 @@ class _PolicyWeb(object): """catch up with all DCAE policies""" started = str(datetime.now()) req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message=req_info, headers=cherrypy.request.headers) + audit = Audit(job_name="catch_up", req_message=req_info, headers=cherrypy.request.headers) PolicyWeb.logger.info("%s", req_info) PolicyReceiver.catch_up(audit) @@ -182,7 +185,7 @@ class _PolicyWeb(object): def shutdown(self): """Shutdown the policy-handler""" req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message=req_info, headers=cherrypy.request.headers) + audit = Audit(job_name="shutdown", req_message=req_info, headers=cherrypy.request.headers) PolicyWeb.logger.info("%s: --- stopping REST API of policy-handler ---", req_info) @@ -203,7 +206,8 @@ class _PolicyWeb(object): def healthcheck(self): """returns the healthcheck results""" req_info = _PolicyWeb._get_request_info(cherrypy.request) - audit = Audit(req_message=req_info, headers=cherrypy.request.headers) + audit = Audit(job_name="healthcheck", + req_message=req_info, headers=cherrypy.request.headers) PolicyWeb.logger.info("%s", req_info) |