Fix issues blocking election handler thread

This issue tracks AT&T bug 355533. The symptom was that drools pdps that were backing each other up were becoming stuck in a standby state. The cause was that the election handler thread was being hung by a call to PolicyEngine.manager.deactivate, which shuts down the topic endpoints when the drools pdp operational state transitions to disabled. Related problems were that the election handler heartbeat was NOT blocked when the main thread was blocked, and that the IntegrityMonitor forward progress counter was NOT blocked from incrementing when the election handler thread was blocked. Together, these prevented the correct failover of the drools pdp to another healthy one. This change fixes the two causes of the thread blockage, moves the election handler heartbeat to the main thread, and adds an interface (AllSeemsWell) which is called when the election handler has stalled/resumed. The AllSeemsWell interface will block forward progress counter increments when ALLNOTWELL and will resume forward progress counter increments when ALLSEEMSWELL. In addition, this change reduces the run time of the StandbyStateManagementTest from approximately 8 minutes to approximately 2 minutes. Because this change modifies classes that are also changed by POLICY-444, it must be merged before POLICY-444 can be merged. Issue-ID: POLICY-501 Change-Id: I7b8180d11077ccf59b21b6484cb58b5522a3df8f Signed-off-by: Kevin McKiou <km097d@att.com>
author: Kevin McKiou <km097d@att.com> 2017-12-13 15:26:59 -0600
committer: Kevin McKiou <km097d@att.com> 2017-12-13 15:27:31 -0600
commit: bc8c8286645f74753d175eee7ca62d989555c96c (patch)
tree: a94d177a5e6ef1908cb56f1153864737b7b42c15 /feature-active-standby-management/src/test/resources/feature-state-management.properties
parent: 4884099eff44975eee57e6748823ff73f965e332 (diff)
1 files changed, 7 insertions, 7 deletions
diff --git a/feature-active-standby-management/src/test/resources/feature-state-management.properties b/feature-active-standby-management/src/test/resources/feature-state-management.properties
index 7856d251..3dd88473 100644
--- a/feature-active-standby-management/src/test/resources/feature-state-management.properties
+++ b/feature-active-standby-management/src/test/resources/feature-state-management.properties
@@ -40,13 +40,13 @@ resource.name=pdp1
 # Name of the site in which this node is hosted 
 site_name = pdp_1
 # Forward Progress Monitor update interval seconds
-fp_monitor_interval = 30
+fp_monitor_interval = 2
 # Failed counter threshold before failover 
-failed_counter_threshold = 3
+failed_counter_threshold = 1
 # Interval between test transactions when no traffic seconds
-test_trans_interval = 10
+test_trans_interval = 1
 # Interval between writes of the FPC to the DB seconds 
-write_fpc_interval = 5
+write_fpc_interval = 1
 # Node type Note: Make sure you don't leave any trailing spaces, or you'll get an 'invalid node type' error! 
 node_type = pdp_drools
 # Dependency groups are groups of resources upon which a node operational state is dependent upon. 
@@ -57,17 +57,17 @@ dependency_groups=
 # The default false is to use state checks for health.
 test_via_jmx=true
 # This is the max number of seconds beyond which a non incrementing FPC is considered a failure
-max_fpc_update_interval=120
+max_fpc_update_interval=5
 # Run the state audit every 60 seconds (60000 ms).  The state audit finds stale DB entries in the 
 # forwardprogressentity table and marks the node as disabled/failed in the statemanagemententity 
 # table. NOTE! It will only run on nodes that have a standbystatus = providingservice.
 # A value of <= 0 will turn off the state audit.
-state_audit_interval_ms=60000
+state_audit_interval_ms=-1
 # The refresh state audit is run every (default) 10 minutes (600000 ms) to clean up any state corruption in the 
 # DB statemanagemententity table. It only refreshes the DB state entry for the local node.  That is, it does not
 # refresh the state of any other nodes.  A value <= 0 will turn the audit off. Any other value will override 
 # the default of 600000 ms.
-refresh_state_audit_interval_ms=600000
+refresh_state_audit_interval_ms=-1
 
 
 # Repository audit properties
author	Kevin McKiou <km097d@att.com>	2017-12-13 15:26:59 -0600
committer	Kevin McKiou <km097d@att.com>	2017-12-13 15:27:31 -0600
commit	bc8c8286645f74753d175eee7ca62d989555c96c (patch)
tree	a94d177a5e6ef1908cb56f1153864737b7b42c15 /feature-active-standby-management/src/test/resources/feature-state-management.properties
parent	4884099eff44975eee57e6748823ff73f965e332 (diff)