From 4ec33c40c749435be7a11b0a794d29d3d6d616dc Mon Sep 17 00:00:00 2001 From: Temoc Rodriguez Date: Mon, 6 Nov 2017 13:29:30 -0800 Subject: Fix bug when IntegrityMonitor runs subsystem late Change dependencyCheck so that subsystemTest is correctly taken into account on the first pass through the dependencies. This allows the status of the repo to remain failed in the case that it fails. Issue-ID: POLICY-431 Change-Id: I23ce43ce41c546edad73ec8055e513ccc61933b0 Signed-off-by: Temoc Rodriguez --- .../onap/policy/common/im/IntegrityMonitor.java | 71 +++++++++++----------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/integrity-monitor/src/main/java/org/onap/policy/common/im/IntegrityMonitor.java b/integrity-monitor/src/main/java/org/onap/policy/common/im/IntegrityMonitor.java index ceb6d695..93cee577 100644 --- a/integrity-monitor/src/main/java/org/onap/policy/common/im/IntegrityMonitor.java +++ b/integrity-monitor/src/main/java/org/onap/policy/common/im/IntegrityMonitor.java @@ -815,7 +815,6 @@ public class IntegrityMonitor { } catch (Exception e) { error_msg = dep + ": resource sanity test failed with exception: "; logger.error("{}", error_msg, e); - // TODO: extract real error message from exception which may be nested } finally { // close the JMX connector if (jmxAgentConnection != null) { @@ -837,6 +836,35 @@ public class IntegrityMonitor { String error_msg = ""; boolean dependencyFailure = false; + /* + * Before we check dependency groups we need to check subsystemTest. 
+ */ + try { + //Test any subsystems that are not covered under the dependency relationship + subsystemTest(); + }catch (Exception e){ + logger.error("IntegrityMonitor threw exception", e); + dependencyFailure=true; + //This indicates a subsystemTest failure + try { + if(logger.isDebugEnabled()){ + logger.debug("{}: There has been a subsystemTest failure with error:{} Updating this resource's state to disableDependency", resourceName, e.getMessage()); + } + //Capture the subsystemTest failure info + if(!error_msg.isEmpty()){ + error_msg = error_msg.concat(","); + } + error_msg = error_msg.concat(resourceName + ": " + e.getMessage()); + this.stateManager.disableDependency(); + } catch (Exception ex) { + logger.error("IntegrityMonitor threw exception.", ex); + if (!error_msg.isEmpty()) { + error_msg = error_msg.concat(","); + } + error_msg = error_msg.concat("\n" + resourceName + ": Failed to disable dependency after subsystemTest failure due to: " + ex.getMessage()); + } + } + // Check the sanity of dependents for lead subcomponents if (dep_groups != null && dep_groups.length > 0) { @@ -906,8 +934,10 @@ public class IntegrityMonitor { }//end for (String group : dep_groups) + /* - * We have checked all the dependency groups. If all are ok, dependencyFailure == false + * We have checked all the dependency groups. If all are ok and subsystemTest passed, + * dependencyFailure == false */ if(!dependencyFailure){ try { @@ -927,7 +957,7 @@ public class IntegrityMonitor { error_msg = error_msg.concat(resourceName + ": Failed to enable no dependency"); } } - }else{ + }else if(!dependencyFailure){ /* * This is put here to clean up when no dependency group should exist, but one was erroneously * added which caused the state to be disabled/dependency/coldstandby and later removed. We saw @@ -951,35 +981,7 @@ public class IntegrityMonitor { } } - /* - * We have checked dependency groups and if there were none, we set enableNoDependency. 
If there were some - * but they are all ok, we set enableNoDependency. So, the recovery from a disabled dependency state - * is handled above. We only need to set disableDependency if the subsystemTest fails. - */ - try { - //Test any subsystems that are not covered under the dependency relationship - subsystemTest(); - }catch (Exception e){ - logger.error("IntegrityMonitor threw exception", e); - //This indicates a subsystemTest failure - try { - if(logger.isDebugEnabled()){ - logger.debug("{}: There has been a subsystemTest failure with error:{} Updating this resource's state to disableDependency", resourceName, e.getMessage()); - } - //Capture the subsystemTest failure info - if(!error_msg.isEmpty()){ - error_msg = error_msg.concat(","); - } - error_msg = error_msg.concat(resourceName + ": " + e.getMessage()); - this.stateManager.disableDependency(); - } catch (Exception ex) { - logger.error("IntegrityMonitor threw exception.", ex); - if (!error_msg.isEmpty()) { - error_msg = error_msg.concat(","); - } - error_msg = error_msg.concat("\n" + resourceName + ": Failed to disable dependency after subsystemTest failure due to: " + ex.getMessage()); - } - } + if (!error_msg.isEmpty()) { logger.error("Sanity failure detected in a dependent resource: {}", error_msg); @@ -1311,8 +1313,6 @@ public class IntegrityMonitor { elapsedTime = 0; // reset elapsed time - // TODO: check if alarm exists - try { if (fpCounter == lastFpCounter) { // no forward progress @@ -1325,7 +1325,6 @@ public class IntegrityMonitor { // Note: The refreshStateAudit will make redundant calls stateManager.disableFailed(); }// The refreshStateAudit will catch the case where opStat = disabled and availState ! 
failed/dependency.failed - // TODO: raise alarm or Nagios alert alarmExists = true; } } else { @@ -1340,8 +1339,6 @@ public class IntegrityMonitor { // Note: The refreshStateAudit will make redundant calls stateManager.enableNotFailed(); }// The refreshStateAudit will catch the case where opState=enabled and availStatus != null - - // TODO: clear alarm or Nagios alert alarmExists = false; } } catch (Exception e) { -- cgit 1.2.3-korg