aboutsummaryrefslogtreecommitdiffstats
path: root/policyhandler
diff options
context:
space:
mode:
authorAlex Shatov <alexs@att.com>2018-04-27 11:53:55 -0400
committerAlex Shatov <alexs@att.com>2018-04-27 11:53:55 -0400
commit50bed534083c96cbf1f8fa4e220cb2b00dff9621 (patch)
tree8abb64c6e6bbbf7a7a2f7d9bf12cfb9b4e166f2e /policyhandler
parent3365431059e2de5977dae447f34a2d42dd2b039b (diff)
2.4.2 policy-handler - fixed race on step-timer
- fixed the bug of unpredictably stopping of the periodic catch-up step-timer due to thread race condition in policy-handler = added critical sections under the reentrant lock on every group of local var change in step-timer - added more stats for healthcheck to track each type of job-operation separately = that helps narrowing down identifying the potential problems - unit test coverage 76% Change-Id: I92ddf6c92a3d225d9b87427e3edfb7f80669501a Signed-off-by: Alex Shatov <alexs@att.com> Issue-ID: DCAEGEN2-472
Diffstat (limited to 'policyhandler')
-rw-r--r--policyhandler/onap/audit.py22
-rw-r--r--policyhandler/policy_receiver.py5
-rw-r--r--policyhandler/policy_rest.py17
-rw-r--r--policyhandler/policy_updater.py12
-rw-r--r--policyhandler/step_timer.py121
-rw-r--r--policyhandler/web_server.py16
6 files changed, 118 insertions, 75 deletions
diff --git a/policyhandler/onap/audit.py b/policyhandler/onap/audit.py
index c615d63..08dcd37 100644
--- a/policyhandler/onap/audit.py
+++ b/policyhandler/onap/audit.py
@@ -28,6 +28,7 @@
import copy
import json
import os
+import re
import subprocess
import sys
import time
@@ -48,6 +49,8 @@ AUDIT_REQUESTID = 'requestID'
AUDIT_IPADDRESS = 'IPAddress'
AUDIT_SERVER = 'server'
AUDIT_TARGET_ENTITY = 'targetEntity'
+AUDIT_METRICS = 'metrics'
+AUDIT_TOTAL_STATS = 'audit_total_stats'
HEADER_CLIENTAUTH = "clientauth"
HEADER_AUTHORIZATION = "authorization"
@@ -119,6 +122,7 @@ class Audit(object):
_service_version = ""
_service_instance_uuid = str(uuid.uuid4())
_started = datetime.now()
+ _key_format = re.compile(r"\W")
_logger_debug = None
_logger_error = None
_logger_metrics = None
@@ -160,14 +164,16 @@ class Audit(object):
"packages" : Audit._packages
}
- def __init__(self, request_id=None, req_message=None, aud_parent=None, **kwargs):
+ def __init__(self, job_name=None, request_id=None, req_message=None, aud_parent=None, **kwargs):
"""create audit object per each request in the system
+ :job_name: is the name of the audit job for health stats
:request_id: is the X-ECOMP-RequestID for tracing
:req_message: is the request message string for logging
:aud_parent: is the parent Audit - used for sub-query metrics to other systems
:kwargs: - put any request related params into kwargs
"""
+ self.job_name = Audit._key_format.sub('_', job_name or req_message or Audit._service_name)
self.request_id = request_id
self.req_message = req_message or ""
self.aud_parent = aud_parent
@@ -178,6 +184,8 @@ class Audit(object):
self._lock = Lock()
if self.aud_parent:
+ self.job_name = Audit._key_format.sub(
+ '_', job_name or self.aud_parent.job_name or Audit._service_name)
if not self.request_id:
self.request_id = self.aud_parent.request_id
if not self.req_message:
@@ -330,17 +338,19 @@ class Audit(object):
self._get_response_status()
metrics_func = None
timer = Audit.get_elapsed_time(self._metrics_started)
+ metrics_job = Audit._key_format.sub(
+ '_', all_kwargs.get(AUDIT_TARGET_ENTITY, AUDIT_METRICS + "_" + self.job_name))
if success:
log_line = "done: {0}".format(log_line)
self.info(log_line, **all_kwargs)
metrics_func = Audit._logger_metrics.info
- Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+ Audit._health.success(metrics_job, timer)
else:
log_line = "failed: {0}".format(log_line)
self.error(log_line, errorCode=response_code.value, \
errorDescription=response_description, **all_kwargs)
metrics_func = Audit._logger_metrics.error
- Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+ Audit._health.error(metrics_job, timer)
metrics_func(log_line, begTime=self._metrics_start_event, timer=timer,
statusCode=Audit.get_status_code(success), responseCode=response_code.value,
@@ -363,13 +373,15 @@ class Audit(object):
log_line = "done: {0}".format(log_line)
self.info(log_line, **all_kwargs)
audit_func = Audit._logger_audit.info
- Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+ Audit._health.success(self.job_name, timer)
+ Audit._health.success(AUDIT_TOTAL_STATS, timer)
else:
log_line = "failed: {0}".format(log_line)
self.error(log_line, errorCode=response_code.value,
errorDescription=response_description, **all_kwargs)
audit_func = Audit._logger_audit.error
- Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+ Audit._health.error(self.job_name, timer)
+ Audit._health.error(AUDIT_TOTAL_STATS, timer)
audit_func(log_line, begTime=self._start_event, timer=timer,
statusCode=Audit.get_status_code(success),
diff --git a/policyhandler/policy_receiver.py b/policyhandler/policy_receiver.py
index 843dbd8..751bea8 100644
--- a/policyhandler/policy_receiver.py
+++ b/policyhandler/policy_receiver.py
@@ -136,8 +136,9 @@ class _PolicyReceiver(Thread):
)
return
- audit = Audit(req_message="policy-notification - updated[{0}], removed[{1}]" \
- .format(len(policies_updated), len(policies_removed)))
+ audit = Audit(job_name="policy_update",
+ req_message="policy-notification - updated[{0}], removed[{1}]"
+ .format(len(policies_updated), len(policies_removed)))
audit.retry_get_config = True
self._policy_updater.enqueue(audit, policies_updated, policies_removed)
diff --git a/policyhandler/policy_rest.py b/policyhandler/policy_rest.py
index 4eee154..0c8920a 100644
--- a/policyhandler/policy_rest.py
+++ b/policyhandler/policy_rest.py
@@ -406,18 +406,26 @@ class PolicyRest(object):
result = {}
aud_policy_filters = None
+ str_policy_filters = None
str_metrics = None
- str_policy_filters = json.dumps(policy_filter or PolicyRest._scope_prefixes)
+ target_entity = None
+
if policy_filter is not None:
aud_policy_filters = [(audit, policy_filter, None)]
+ str_policy_filters = json.dumps(policy_filter)
str_metrics = "get_latest_policies for policy_filter {0}".format(
str_policy_filters)
+ target_entity = ("{0} total get_latest_policies by policy_filter"
+ .format(PolicyRest._target_entity))
result[POLICY_FILTER] = copy.deepcopy(policy_filter)
else:
aud_policy_filters = [(audit, {POLICY_NAME:scope_prefix + ".*"}, scope_prefix)
for scope_prefix in PolicyRest._scope_prefixes]
+ str_policy_filters = json.dumps(PolicyRest._scope_prefixes)
str_metrics = "get_latest_policies for scopes {0} {1}".format( \
len(PolicyRest._scope_prefixes), str_policy_filters)
+ target_entity = ("{0} total get_latest_policies by scope_prefixes"
+ .format(PolicyRest._target_entity))
result[SCOPE_PREFIXES] = copy.deepcopy(PolicyRest._scope_prefixes)
PolicyRest._logger.debug("%s", str_policy_filters)
@@ -433,9 +441,10 @@ class PolicyRest(object):
pool.close()
pool.join()
- audit.metrics("total result {0}: {1} {2}".format(
- str_metrics, len(latest_policies), json.dumps(latest_policies)), \
- targetEntity=PolicyRest._target_entity, targetServiceName=PolicyRest._url_get_config)
+ audit.metrics(
+ "total result {0}: {1} {2}".format(
+ str_metrics, len(latest_policies), json.dumps(latest_policies)),
+ targetEntity=target_entity, targetServiceName=PolicyRest._url_get_config)
# latest_policies == [(valid_policies, errored_policies, errored_scope_prefix), ...]
result[LATEST_POLICIES] = dict(
diff --git a/policyhandler/policy_updater.py b/policyhandler/policy_updater.py
index 0b9e227..70823fa 100644
--- a/policyhandler/policy_updater.py
+++ b/policyhandler/policy_updater.py
@@ -47,7 +47,7 @@ class PolicyUpdater(Thread):
self._aud_shutdown = None
self._aud_catch_up = None
- catch_up_config = Config.config.get("catch_up", {})
+ catch_up_config = Config.config.get(CATCH_UP, {})
self._catch_up_interval = catch_up_config.get("interval") or 15*60
self._catch_up_max_skips = catch_up_config.get("max_skips") or 3
self._catch_up_skips = 0
@@ -120,8 +120,10 @@ class PolicyUpdater(Thread):
"""need to bring the latest policies to DCAE-Controller"""
with self._lock:
self._aud_catch_up = audit or Audit(req_message=AUTO_CATCH_UP)
- PolicyUpdater._logger.info("catch_up %s request_id %s",
- self._aud_catch_up.req_message, self._aud_catch_up.request_id)
+ PolicyUpdater._logger.info(
+ "catch_up %s request_id %s",
+ self._aud_catch_up.req_message, self._aud_catch_up.request_id
+ )
self.enqueue()
@@ -131,8 +133,8 @@ class PolicyUpdater(Thread):
return
if self._catch_up_timer:
- self._catch_up_timer.next()
self._logger.info("next step catch_up_timer in %s", self._catch_up_interval)
+ self._catch_up_timer.next()
return
self._catch_up_timer = StepTimer(
@@ -142,8 +144,8 @@ class PolicyUpdater(Thread):
PolicyUpdater._logger,
self
)
- self._catch_up_timer.start()
self._logger.info("started catch_up_timer in %s", self._catch_up_interval)
+ self._catch_up_timer.start()
def _pause_catch_up_timer(self):
"""pause catch_up_timer"""
diff --git a/policyhandler/step_timer.py b/policyhandler/step_timer.py
index ad77c85..3936107 100644
--- a/policyhandler/step_timer.py
+++ b/policyhandler/step_timer.py
@@ -19,7 +19,7 @@
"""periodically callback"""
from datetime import datetime
-from threading import Event, Lock, Thread
+from threading import Event, RLock, Thread
class StepTimer(Thread):
@@ -40,67 +40,75 @@ class StepTimer(Thread):
self._args = args
self._kwargs = kwargs
- self._lock = Lock()
+ self._lock = RLock()
self._timeout = Event()
- self._paused = Event()
+ self._waiting_for_timeout = False
self._next = Event()
- self._finished = Event()
+ self._paused = False
+ self._finished = False
- self._event = StepTimer.INIT
- self._event_counter = 0
- self._event_time = 0
- self._event_ts = datetime.now()
+ self._request = StepTimer.INIT
+ self._req_count = 0
+ self._req_time = 0
+ self._req_ts = datetime.now()
self._substep = None
self._substep_time = 0
self._substep_ts = datetime.now()
- def get_status(self):
- """returns status of events"""
+ def get_timer_status(self):
+ """returns timer status"""
with self._lock:
return "{0}[{1}] {2}: timeout({3}), paused({4}), next({5}), finished({6})".format(
- self._event,
- self._event_counter,
+ self._request,
+ self._req_count,
self._substep,
self._timeout.is_set(),
- self._paused.is_set(),
+ self._paused,
self._next.is_set(),
- self._finished.is_set(),
+ self._finished,
)
def next(self):
"""continue with the next timeout"""
- self._paused.clear()
- self._next.set()
- self._timeout.set()
- self._set_timer_event(StepTimer.NEXT)
+ with self._lock:
+ self._paused = False
+ if self._waiting_for_timeout:
+ self._next.set()
+ self._timeout.set()
+ else:
+ self._next.set()
+ self._request_to_timer(StepTimer.NEXT)
def pause(self):
"""pause the timer"""
- self._paused.set()
- self._next.clear()
- self._set_timer_event(StepTimer.PAUSED)
+ with self._lock:
+ self._paused = True
+ self._next.clear()
+ self._request_to_timer(StepTimer.PAUSED)
def stop(self):
"""stop the timer if it hasn't finished yet"""
- self._finished.set()
- self._timeout.set()
- self._next.set()
- self._set_timer_event(StepTimer.STOPPING)
+ with self._lock:
+ self._finished = True
+ self._timeout.set()
+ self._next.set()
+ self._request_to_timer(StepTimer.STOPPING)
- def _set_timer_event(self, event):
- """set the event on the timer"""
+ def _request_to_timer(self, request):
+ """set the request on the timer"""
with self._lock:
- if event in [StepTimer.NEXT, StepTimer.STARTED]:
- self._event_counter += 1
+ if request in [StepTimer.NEXT, StepTimer.STARTED]:
+ self._req_count += 1
- self._event = event
+ prev_req = self._request
+ self._request = request
now = datetime.now()
- self._event_time = (now - self._event_ts).total_seconds()
- self._event_ts = now
- self._logger.info("[{0}] {1} {2}".format(
- self._event_time, self.name, self.get_status()))
+ self._req_time = (now - self._req_ts).total_seconds()
+ self._req_ts = now
+ self._logger.info("{0}[{1}] {2}->{3}".format(
+ self.name, self._req_time, prev_req, self.get_timer_status()))
def _timer_substep(self, substep):
"""log exe step"""
@@ -109,27 +117,34 @@ class StepTimer(Thread):
now = datetime.now()
self._substep_time = (now - self._substep_ts).total_seconds()
self._substep_ts = now
- self._logger.info("[{0}] {1}".format(self._substep_time, self.get_status()))
+ self._logger.info("[{0}] {1}".format(self._substep_time, self.get_timer_status()))
def run(self):
"""loop one step a time until stopped=finished"""
- self._set_timer_event(StepTimer.STARTED)
+ self._request_to_timer(StepTimer.STARTED)
while True:
- self._timer_substep("waiting for timeout {0}...".format(self._interval))
- self._timeout.wait(self._interval)
- self._timer_substep("woke up after timeout")
+ with self._lock:
+ self._timeout.clear()
+ self._waiting_for_timeout = True
+ self._timer_substep("waiting for timeout {0}...".format(self._interval))
- if self._finished.is_set():
- self._timer_substep("finished")
- break
+ interrupted = self._timeout.wait(self._interval)
- if self._next.is_set():
- self._next.clear()
- self._timeout.clear()
- self._timer_substep("restart timer")
- continue
+ with self._lock:
+ self._waiting_for_timeout = False
+ self._timer_substep("woke up after {0}timeout"
+ .format((interrupted and "interrupted ") or ""))
+
+ if self._finished:
+ self._timer_substep("finished")
+ break
- if self._paused.is_set():
+ if self._next.is_set() and interrupted:
+ self._next.clear()
+ self._timer_substep("restart timer")
+ continue
+
+ if self._paused:
self._timer_substep("paused - skip on_time event")
else:
self._timer_substep("on_time event")
@@ -137,12 +152,12 @@ class StepTimer(Thread):
self._timer_substep("waiting for next...")
self._next.wait()
- self._next.clear()
- self._timeout.clear()
- self._timer_substep("woke up on next")
+ with self._lock:
+ self._next.clear()
+ self._timer_substep("woke up on next")
- if self._finished.is_set():
+ if self._finished:
self._timer_substep("finished")
break
- self._set_timer_event(StepTimer.STOPPED)
+ self._request_to_timer(StepTimer.STOPPED)
diff --git a/policyhandler/web_server.py b/policyhandler/web_server.py
index 20e74e3..e9cc9cc 100644
--- a/policyhandler/web_server.py
+++ b/policyhandler/web_server.py
@@ -56,7 +56,8 @@ class _PolicyWeb(object):
def policy_latest(self, policy_id):
"""retireves the latest policy identified by policy_id"""
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message=req_info, headers=cherrypy.request.headers)
+ audit = Audit(job_name="get_latest_policy",
+ req_message=req_info, headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s policy_id=%s headers=%s", \
req_info, policy_id, json.dumps(cherrypy.request.headers))
@@ -74,7 +75,8 @@ class _PolicyWeb(object):
def _get_all_policies_latest(self):
"""retireves all the latest policies on GET /policies_latest"""
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message=req_info, headers=cherrypy.request.headers)
+ audit = Audit(job_name="get_all_policies_latest",
+ req_message=req_info, headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s", req_info)
@@ -146,7 +148,8 @@ class _PolicyWeb(object):
str_policy_filter = json.dumps(policy_filter)
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message="{0}: {1}".format(req_info, str_policy_filter), \
+ audit = Audit(job_name="get_latest_policies",
+ req_message="{0}: {1}".format(req_info, str_policy_filter), \
headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s: policy_filter=%s headers=%s", \
req_info, str_policy_filter, json.dumps(cherrypy.request.headers))
@@ -168,7 +171,7 @@ class _PolicyWeb(object):
"""catch up with all DCAE policies"""
started = str(datetime.now())
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message=req_info, headers=cherrypy.request.headers)
+ audit = Audit(job_name="catch_up", req_message=req_info, headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s", req_info)
PolicyReceiver.catch_up(audit)
@@ -182,7 +185,7 @@ class _PolicyWeb(object):
def shutdown(self):
"""Shutdown the policy-handler"""
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message=req_info, headers=cherrypy.request.headers)
+ audit = Audit(job_name="shutdown", req_message=req_info, headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s: --- stopping REST API of policy-handler ---", req_info)
@@ -203,7 +206,8 @@ class _PolicyWeb(object):
def healthcheck(self):
"""returns the healthcheck results"""
req_info = _PolicyWeb._get_request_info(cherrypy.request)
- audit = Audit(req_message=req_info, headers=cherrypy.request.headers)
+ audit = Audit(job_name="healthcheck",
+ req_message=req_info, headers=cherrypy.request.headers)
PolicyWeb.logger.info("%s", req_info)