2.4.2 policy-handler - fixed race on step-timer

- fixed the bug where the periodic catch-up step-timer stopped unpredictably due to a thread race condition in policy-handler = added critical sections under the reentrant lock around every group of local variable changes in step-timer - added more stats for healthcheck to track each type of job-operation separately = that helps narrow down and identify potential problems - unit test coverage 76% Change-Id: I92ddf6c92a3d225d9b87427e3edfb7f80669501a Signed-off-by: Alex Shatov <alexs@att.com> Issue-ID: DCAEGEN2-472
author: Alex Shatov <alexs@att.com> 2018-04-27 11:53:55 -0400
committer: Alex Shatov <alexs@att.com> 2018-04-27 11:53:55 -0400
commit: 50bed534083c96cbf1f8fa4e220cb2b00dff9621 (patch)
tree: 8abb64c6e6bbbf7a7a2f7d9bf12cfb9b4e166f2e /policyhandler/onap/audit.py
parent: 3365431059e2de5977dae447f34a2d42dd2b039b (diff)
1 files changed, 17 insertions, 5 deletions
diff --git a/policyhandler/onap/audit.py b/policyhandler/onap/audit.py
index c615d63..08dcd37 100644
--- a/policyhandler/onap/audit.py
+++ b/policyhandler/onap/audit.py
@@ -28,6 +28,7 @@
 import copy
 import json
 import os
+import re
 import subprocess
 import sys
 import time
@@ -48,6 +49,8 @@ AUDIT_REQUESTID = 'requestID'
 AUDIT_IPADDRESS = 'IPAddress'
 AUDIT_SERVER = 'server'
 AUDIT_TARGET_ENTITY = 'targetEntity'
+AUDIT_METRICS = 'metrics'
+AUDIT_TOTAL_STATS = 'audit_total_stats'
 
 HEADER_CLIENTAUTH = "clientauth"
 HEADER_AUTHORIZATION = "authorization"
@@ -119,6 +122,7 @@ class Audit(object):
     _service_version = ""
     _service_instance_uuid = str(uuid.uuid4())
     _started = datetime.now()
+    _key_format = re.compile(r"\W")
     _logger_debug = None
     _logger_error = None
     _logger_metrics = None
@@ -160,14 +164,16 @@ class Audit(object):
             "packages" : Audit._packages
         }
 
-    def __init__(self, request_id=None, req_message=None, aud_parent=None, **kwargs):
+    def __init__(self, job_name=None, request_id=None, req_message=None, aud_parent=None, **kwargs):
         """create audit object per each request in the system
 
+        :job_name: is the name of the audit job for health stats
         :request_id: is the X-ECOMP-RequestID for tracing
         :req_message: is the request message string for logging
         :aud_parent: is the parent Audit - used for sub-query metrics to other systems
         :kwargs: - put any request related params into kwargs
         """
+        self.job_name = Audit._key_format.sub('_', job_name or req_message or Audit._service_name)
         self.request_id = request_id
         self.req_message = req_message or ""
         self.aud_parent = aud_parent
@@ -178,6 +184,8 @@ class Audit(object):
         self._lock = Lock()
 
         if self.aud_parent:
+            self.job_name = Audit._key_format.sub(
+                '_', job_name or self.aud_parent.job_name or Audit._service_name)
             if not self.request_id:
                 self.request_id = self.aud_parent.request_id
             if not self.req_message:
@@ -330,17 +338,19 @@ class Audit(object):
             self._get_response_status()
         metrics_func = None
         timer = Audit.get_elapsed_time(self._metrics_started)
+        metrics_job = Audit._key_format.sub(
+            '_', all_kwargs.get(AUDIT_TARGET_ENTITY, AUDIT_METRICS + "_" + self.job_name))
         if success:
             log_line = "done: {0}".format(log_line)
             self.info(log_line, **all_kwargs)
             metrics_func = Audit._logger_metrics.info
-            Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+            Audit._health.success(metrics_job, timer)
         else:
             log_line = "failed: {0}".format(log_line)
             self.error(log_line, errorCode=response_code.value, \
                 errorDescription=response_description, **all_kwargs)
             metrics_func = Audit._logger_metrics.error
-            Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+            Audit._health.error(metrics_job, timer)
 
         metrics_func(log_line, begTime=self._metrics_start_event, timer=timer,
                      statusCode=Audit.get_status_code(success), responseCode=response_code.value,
@@ -363,13 +373,15 @@ class Audit(object):
             log_line = "done: {0}".format(log_line)
             self.info(log_line, **all_kwargs)
             audit_func = Audit._logger_audit.info
-            Audit._health.success(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+            Audit._health.success(self.job_name, timer)
+            Audit._health.success(AUDIT_TOTAL_STATS, timer)
         else:
             log_line = "failed: {0}".format(log_line)
             self.error(log_line, errorCode=response_code.value,
                        errorDescription=response_description, **all_kwargs)
             audit_func = Audit._logger_audit.error
-            Audit._health.error(all_kwargs.get(AUDIT_TARGET_ENTITY, Audit._service_name), timer)
+            Audit._health.error(self.job_name, timer)
+            Audit._health.error(AUDIT_TOTAL_STATS, timer)
 
         audit_func(log_line, begTime=self._start_event, timer=timer,
                    statusCode=Audit.get_status_code(success),
author	Alex Shatov <alexs@att.com>	2018-04-27 11:53:55 -0400
committer	Alex Shatov <alexs@att.com>	2018-04-27 11:53:55 -0400
commit	50bed534083c96cbf1f8fa4e220cb2b00dff9621 (patch)
tree	8abb64c6e6bbbf7a7a2f7d9bf12cfb9b4e166f2e /policyhandler/onap/audit.py
parent	3365431059e2de5977dae447f34a2d42dd2b039b (diff)