diff options
author | Kevin McKiou <km097d@att.com> | 2017-12-13 15:26:59 -0600 |
---|---|---|
committer | Kevin McKiou <km097d@att.com> | 2017-12-13 15:27:31 -0600 |
commit | bc8c8286645f74753d175eee7ca62d989555c96c (patch) | |
tree | a94d177a5e6ef1908cb56f1153864737b7b42c15 /feature-active-standby-management/src/test/resources | |
parent | 4884099eff44975eee57e6748823ff73f965e332 (diff) |
Fix issues blocking election handler thread
This bug tracks the AT&T bug 355533. The symptom was that drools
pdps that were backing each other up were becoming stuck in a
standby state. The cause was that the election handler thread was
being hung by a call to PolicyEngine.manager.deactivate which shuts
down the topic endpoints when the drools pdp operational state
transitions to disabled. Related problems were that the election
handler heartbeat was NOT blocked when the main thread was blocked
and the IntegrityMonitor forward progress counter was NOT blocked
from incrementing when the election handler thread was blocked.
This prevented the correct failover of the drools pdp to another
healthy one. This change fixes the two causes of the thread blockage,
moves the election handler heartbeat to the main thread and adds an
interface (AllSeemsWell) which is called when the election handler
has stalled/resumed. The AllSeemsWell interface will block forward
progress counter increments when ALLNOTWELL and will resume forward
progress counter increments when ALLSEEMSWELL. In addition, it reduces
the run time of the StandbyStateManagementTest from approximately 8
minutes to approximately 2 minutes. Since this changes classes also
changed by POLICY-444, this change must be merged before POLICY-444
can be merged.
Issue-ID: POLICY-501
Change-Id: I7b8180d11077ccf59b21b6484cb58b5522a3df8f
Signed-off-by: Kevin McKiou <km097d@att.com>
Diffstat (limited to 'feature-active-standby-management/src/test/resources')
4 files changed, 129 insertions, 10 deletions
diff --git a/feature-active-standby-management/src/test/resources/asw/feature-active-standby-management.properties b/feature-active-standby-management/src/test/resources/asw/feature-active-standby-management.properties new file mode 100644 index 00000000..f0711e6c --- /dev/null +++ b/feature-active-standby-management/src/test/resources/asw/feature-active-standby-management.properties @@ -0,0 +1,39 @@ +### +# ============LICENSE_START======================================================= +# feature-active-standby-management +# ================================================================================ +# Copyright (C) 2017 AT&T Intellectual Property. All rights reserved. +# ================================================================================ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============LICENSE_END========================================================= +### + +# DB properties +javax.persistence.jdbc.driver = org.h2.Driver +javax.persistence.jdbc.url = jdbc:h2:file:./sql/activestandbymanagement +javax.persistence.jdbc.user = sa +javax.persistence.jdbc.password = + +# Must be unique across the system +resource.name=pdp1 +# Name of the site in which this node is hosted +site_name=pdp_1 + +# Needed by DroolsPdpsElectionHandler +pdp.checkInterval=1500 +pdp.updateInterval=1000 + +# Need long timeout, because testTransaction is only run every 1 seconds. +pdp.timeout=3000 +#how long do we wait for the pdp table to populate on initial startup +pdp.initialWait=1000
\ No newline at end of file diff --git a/feature-active-standby-management/src/test/resources/asw/feature-state-management.properties b/feature-active-standby-management/src/test/resources/asw/feature-state-management.properties new file mode 100644 index 00000000..2629c63d --- /dev/null +++ b/feature-active-standby-management/src/test/resources/asw/feature-state-management.properties @@ -0,0 +1,81 @@ +### +# ============LICENSE_START======================================================= +# feature-active-standby-management +# ================================================================================ +# Copyright (C) 2017 AT&T Intellectual Property. All rights reserved. +# ================================================================================ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============LICENSE_END========================================================= +### + +# DB properties +javax.persistence.jdbc.driver = org.h2.Driver +javax.persistence.jdbc.url = jdbc:h2:file:./sql/statemanagement +javax.persistence.jdbc.user = sa +javax.persistence.jdbc.password = + +# DroolsPDPIntegrityMonitor Properties + +http.server.services=TEST +http.server.services.TEST.host=0.0.0.0 +http.server.services.TEST.port=9981 +#These properties will default to the following if no other values are provided: +# http.server.services.TEST.restClasses=org.onap.policy.drools.statemanagement.IntegrityMonitorRestManager +# http.server.services.TEST.managed=false +# http.server.services.TEST.swagger=true + +#IntegrityMonitor Properties + +# Must be unique across the system +resource.name=pdp1 +# Name of the site in which this node is hosted +site_name = pdp_1 +# How often in sec the forward progress monitor checks for forward progress +fp_monitor_interval = 2 +# Failed counter threshold before failover +failed_counter_threshold = 1 +# Interval between test transactions when no traffic seconds +test_trans_interval = 1 +# Interval between writes of the FPC to the DB seconds +write_fpc_interval = 1 +# Node type Note: Make sure you don't leave any trailing spaces, or you'll get an 'invalid node type' error! +node_type = pdp_drools +# Dependency groups are groups of resources upon which a node operational state is dependent upon. +# Each group is a comma-separated list of resource names and groups are separated by a semicolon. For example: +# dependency_groups=site_1.astra_1,site_1.astra_2;site_1.brms_1,site_1.brms_2;site_1.logparser_1;site_1.pypdp_1 +dependency_groups= +# When set to true, dependent health checks are performed by using JMX to invoke test() on the dependent. +# The default false is to use state checks for health. +test_via_jmx=true +# This is the max number of seconds beyond which a non incrementing FPC is considered a failure +max_fpc_update_interval=5 +# Run the state audit every 60 seconds (60000 ms). The state audit finds stale DB entries in the +# forwardprogressentity table and marks the node as disabled/failed in the statemanagemententity +# table. NOTE! It will only run on nodes that have a standbystatus = providingservice. +# A value of <= 0 will turn off the state audit. +state_audit_interval_ms= -1 +# The refresh state audit is run every (default) 10 minutes (600000 ms) to clean up any state corruption in the +# DB statemanagemententity table. It only refreshes the DB state entry for the local node. That is, it does not +# refresh the state of any other nodes. A value <= 0 will turn the audit off. Any other value will override +# the default of 600000 ms. +refresh_state_audit_interval_ms=-1 + + +# Repository audit properties +# Flag to control the execution of the subsystemTest for the Nexus Maven repository +repository.audit.is.active=false +repository.audit.ignore.errors=true + +# DB Audit Properties +# Flag to control the execution of the subsystemTest for the Database +db.audit.is.active=false diff --git a/feature-active-standby-management/src/test/resources/feature-active-standby-management.properties b/feature-active-standby-management/src/test/resources/feature-active-standby-management.properties index bbae5d98..827d2e17 100644 --- a/feature-active-standby-management/src/test/resources/feature-active-standby-management.properties +++ b/feature-active-standby-management/src/test/resources/feature-active-standby-management.properties @@ -32,8 +32,7 @@ site_name=pdp_1 # Needed by DroolsPdpsElectionHandler pdp.checkInterval=1500 pdp.updateInterval=1000 -#pdp.timeout=3000 # Need long timeout, because testTransaction is only run every 10 seconds. -pdp.timeout=15000 +pdp.timeout=3000 #how long do we wait for the pdp table to populate on initial startup -pdp.initialWait=20000
\ No newline at end of file +pdp.initialWait=1000
\ No newline at end of file diff --git a/feature-active-standby-management/src/test/resources/feature-state-management.properties b/feature-active-standby-management/src/test/resources/feature-state-management.properties index 7856d251..3dd88473 100644 --- a/feature-active-standby-management/src/test/resources/feature-state-management.properties +++ b/feature-active-standby-management/src/test/resources/feature-state-management.properties @@ -40,13 +40,13 @@ resource.name=pdp1 # Name of the site in which this node is hosted site_name = pdp_1 # Forward Progress Monitor update interval seconds -fp_monitor_interval = 30 +fp_monitor_interval = 2 # Failed counter threshold before failover -failed_counter_threshold = 3 +failed_counter_threshold = 1 # Interval between test transactions when no traffic seconds -test_trans_interval = 10 +test_trans_interval = 1 # Interval between writes of the FPC to the DB seconds -write_fpc_interval = 5 +write_fpc_interval = 1 # Node type Note: Make sure you don't leave any trailing spaces, or you'll get an 'invalid node type' error! node_type = pdp_drools # Dependency groups are groups of resources upon which a node operational state is dependent upon. @@ -57,17 +57,17 @@ dependency_groups= # The default false is to use state checks for health. test_via_jmx=true # This is the max number of seconds beyond which a non incrementing FPC is considered a failure -max_fpc_update_interval=120 +max_fpc_update_interval=5 # Run the state audit every 60 seconds (60000 ms). The state audit finds stale DB entries in the # forwardprogressentity table and marks the node as disabled/failed in the statemanagemententity # table. NOTE! It will only run on nodes that have a standbystatus = providingservice. # A value of <= 0 will turn off the state audit. -state_audit_interval_ms=60000 +state_audit_interval_ms=-1 # The refresh state audit is run every (default) 10 minutes (600000 ms) to clean up any state corruption in the # DB statemanagemententity table. It only refreshes the DB state entry for the local node. That is, it does not # refresh the state of any other nodes. A value <= 0 will turn the audit off. Any other value will override # the default of 600000 ms. -refresh_state_audit_interval_ms=600000 +refresh_state_audit_interval_ms=-1 # Repository audit properties |