aboutsummaryrefslogtreecommitdiffstats
path: root/snmptrap/mod/trapd_stormwatch.py
diff options
context:
space:
mode:
Diffstat (limited to 'snmptrap/mod/trapd_stormwatch.py')
-rw-r--r--snmptrap/mod/trapd_stormwatch.py513
1 files changed, 513 insertions, 0 deletions
diff --git a/snmptrap/mod/trapd_stormwatch.py b/snmptrap/mod/trapd_stormwatch.py
new file mode 100644
index 0000000..623fe39
--- /dev/null
+++ b/snmptrap/mod/trapd_stormwatch.py
@@ -0,0 +1,513 @@
+# ============LICENSE_START=======================================================
+# Copyright (c) 2020 AT&T Intellectual Property. All rights reserved.
+# ================================================================================
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============LICENSE_END=========================================================
+"""
+trapd_stormwatch makes the decision on whether an
+arriving SNMP trap is exceeding a pre-configured
+threshold (storm), and if so it will return "False"
+so the trap can be logged and immediately discarded
+"""
+
+__docformat__ = 'restructuredtext'
+
+import sys
+import os
+import string
+import time
+
+from trapd_io import stdout_logger, ecomp_logger
+import trapd_settings as tds
+import trapd_stats_settings as stats
+import trapd_stormwatch_settings as sws
+from trapd_exit import cleanup_and_exit
+
+prog_name = os.path.basename(__file__)
+
+
+def sw_init():
+
+ # <Storm Watch>
+ #
+ # sw_storm_counter_dict
+ # key [<ip address>.<notify oid>] -> count
+ #
+ # sw_storm_active_dict
+ # key [<ip address>.<notify oid>] -> True (or False, but no key present
+ # means False)
+ #
+ # sw_config_oid_dict
+ # key [<notify oid>] -> "true" (key presence means it participates)
+ #
+ # sw_config_low_water_in_interval_dict
+ # key [<notify oid>] -> <int> value that stormActive turns False
+ #
+ # sw_config_high_water_in_interval_dict
+ # key [<notify oid>] -> <int> value that stormActive turns True
+ #
+ sws.sw_counter_dict = {}
+ sws.sw_storm_active_dict = {}
+ sws.sw_config_oid_dict = {}
+ sws.sw_config_low_water_in_interval_dict = {}
+ sws.sw_config_high_water_in_interval_dict = {}
+ sws.sw_interval_in_seconds = 60
+
+
+# # # # # # # # # # # # #
+# fx: sw_storm_active
+# - check if storm is active for agent/oid
+# - returns True if yes, False if no
+# # # # # # # # # # # # #
+def sw_clear_dicts():
+ """
+ Clear all storm watch dictionaries
+ :Parameters:
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch count threshold
+ :Variables:
+ """
+ try:
+ stats.oid_counter_dict.clear()
+ stats.agent_counter_dict.clear()
+ sws.sw_storm_active_dict.clear()
+ sws.sw_storm_counter_dict.clear()
+ sws.sw_config_oid_dict.clear()
+ sws.sw_config_low_water_in_interval_dict.clear()
+ sws.sw_config_high_water_in_interval_dict.clear()
+ sws.sw_config_category.clear()
+ return True
+ except Exception as e:
+ msg = "unable to reset stormwatch dictionaries - results will be indeterminate: %s" % (
+ e)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ return False
+
+# # # # # # # # # # # # #
+# fx: sw_load_trap_config
+# - load trap configurations from CBS response
+# # # # # # # # # # # # #
+
+
+def sw_load_trap_config(_config):
+ """
+ Load trap configs into dictionary
+ :Parameters:
+ _config: trapd_config from CBS
+ :Exceptions:
+ """
+
+ # clear any dicts present from previous invocations
+ try:
+ sws.sw_storm_active_dict
+ ret = sw_clear_dicts()
+ msg = ("reset existing sws dictionaries to empty")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ except NameError:
+ msg = ("sws dictionaries not present - initializing")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ ret = sw_init()
+
+ # set last storm analysis to now
+ sws.sw_last_stormwatch_dict_analysis = int(time.time())
+
+ # get metric % threshold for logging trap count by-agent to metric log
+ try:
+ stats.metric_log_notification_threshold_pct = int(
+ _config["trap_config"]["metric_log_notification_threshold_pct"])
+ msg = ("metric_log_notification_threshold_pct value: %d" %
+ stats.metric_log_notification_threshold_pct)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ except Exception as e:
+ msg = (
+ "metric_log_notification_threshold_pct not present in config - default to 25")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ stats.metric_log_notification_threshold_pct = 25
+
+ # get stormwatch interval; default to 60 seconds
+ try:
+ sws.sw_interval_in_seconds = int(
+ _config["trap_config"]["sw_interval_in_seconds"])
+ msg = ("sw_interval_in_seconds value: %d" % sws.sw_interval_in_seconds)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ except Exception as e:
+ msg = ("sw_interval_in_seconds not present in config - default to 60 seconds")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ sws.sw_interval_in_seconds = 60
+
+ # add trap configs from CBS json structure to running config
+ try:
+ notify_oids = _config["trap_config"]["notify_oids"]
+ except Exception as e:
+ msg = ("no trap_config or notify_oids defined")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ return False
+
+ trap_block_counter = 0
+ for trap_block in notify_oids:
+ # oid
+ try:
+ _oid = trap_block['oid']
+ except Exception as e:
+ msg = (
+ "missing oid value in notify_oids - oid section of CBS config - using empty value, disregard entry")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+ _oid = None
+
+ # sw_high_water_in_interval
+ try:
+ _sw_high_water_in_interval = int(
+ trap_block['sw_high_water_in_interval'])
+ except Exception as e:
+ msg = (
+ "missing sw_high_water_in_interval value in notify_oids - oid section of CBS config - using empty value")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+ _sw_high_water_in_interval = None
+
+ # sw_low_water_in_interval
+ try:
+ _sw_low_water_in_interval = int(
+ trap_block['sw_low_water_in_interval'])
+ except Exception as e:
+ msg = (
+ "missing sw_low_water_in_interval value in notify_oids - oid section of CBS config - using empty value")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+ _sw_low_water_in_interval = None
+
+ # category
+ try:
+ _category = trap_block['category']
+ except Exception as e:
+ msg = (
+ "missing category value in notify_oids - oid section of CBS config - using empty value")
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+ _category = None
+
+ if (_oid is not None and _category is not None and
+ _sw_low_water_in_interval is not None and _sw_high_water_in_interval is not None and
+ _sw_low_water_in_interval < _sw_high_water_in_interval):
+ # FMDL: Do we actually need sw_config_oid_dict?
+ msg = ("oid: %s sw_high_water_in_interval: %d sw_low_water_in_interval: %d category: %s" % (
+ _oid, _sw_high_water_in_interval, _sw_low_water_in_interval, _category))
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ sws.sw_config_oid_dict[_oid] = True
+ sws.sw_config_low_water_in_interval_dict[_oid] = _sw_low_water_in_interval
+ sws.sw_config_high_water_in_interval_dict[_oid] = _sw_high_water_in_interval
+ sws.sw_config_category[_oid] = _category
+ trap_block_counter += 1
+ else:
+ msg = ("Missing or incorrect value for stormwatch config entry %d: skipping: %s" % (
+ trap_block_counter, trap_block))
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+
+
+ return trap_block_counter
+
+
+# # # # # # # # # # # # #
+# fx: sw_log_metrics
+# - log stats for any count in interval that is > sw_metric_log_notification_threshold_pct % of total arriving traps
+# # # # # # # # # # # # #
+def sw_log_metrics():
+ """
+ Log counts for agents that exceed sw_metric_log_notification_threshold_pct % of
+ total traps that arrived in interval
+ :Parameters:
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch metrics
+ :Variables:
+ """
+
+ msg = "total notifications: %d, interval in seconds: %d" % (
+ stats.total_notifications, sws.sw_interval_in_seconds)
+ ecomp_logger(tds.LOG_TYPE_METRICS, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+
+ # print metrics for total traps and traps-per-second avg
+ # during sample interval
+ avg_traps_per_second = stats.total_notifications / 60
+ msg = "total traps: %d, interval in seconds: %d, average traps-per-second: %d" % (
+ stats.total_notifications, sws.sw_interval_in_seconds, avg_traps_per_second)
+ ecomp_logger(tds.LOG_TYPE_METRICS, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+
+ # print metrics for any agent that represents more than stats.metric_log_notification_threshold_pct
+ # during sample interval
+ for k in stats.agent_counter_dict:
+ c = stats.agent_counter_dict[k]
+ p = c / stats.total_notifications * 100
+ if p > stats.metric_log_notification_threshold_pct:
+ msg = "agent: %s, notifications: %d, interval in seconds: %d, percent of total traps: %d" % (
+ k, c, sws.sw_interval_in_seconds, p)
+ ecomp_logger(tds.LOG_TYPE_METRICS, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+
+# # # # # # # # # # # # #
+# fx: stats_increment_counters
+# - increment dictionary counters that are accumulating
+# - total traps by OID and agent for each sample interval
+# # # # # # # # # # # # #
+
+
+def stats_increment_counters(_loc_agent, _loc_oid):
+ """
+ update counters tracking traps-per-interval by
+ OID and agent
+ :Parameters:
+ _loc_agent
+ agent address from trap PDU
+ _loc_oid
+ notify OID from trap PDU
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch stats metrics
+ :Variables:
+ """
+ # increment oid occurances in window
+ msg = "increment metric counters for %s %s" % (_loc_agent, _loc_oid)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ try:
+ stats.total_notifications += 1
+ except Exception as e:
+ stats.total_notifications = 1
+
+ try:
+ stats.oid_counter_dict[_loc_oid] += 1
+ except Exception as e:
+ stats.oid_counter_dict[_loc_oid] = 1
+
+ # increment agent occurances in window
+ try:
+ stats.agent_counter_dict[_loc_agent] += 1
+ except Exception as e:
+ stats.agent_counter_dict[_loc_agent] = 1
+
+
+# # # # # # # # # # # # #
+# fx: sw_storm_active
+# - check if storm is active for agent/oid
+# - returns True if yes, False if no
+# # # # # # # # # # # # #
+def sw_storm_active(_loc_agent, _loc_oid):
+ """
+ Check if this event is currently in an
+ active storm state.
+ :Parameters:
+ _loc_agent
+ agent address from trap PDU
+ _loc_oid
+ notify OID from trap PDU
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch count threshold
+ :Variables:
+ """
+
+ # we have to come here for every arriving trap, so increment
+ # trap counter dictionaries while we are here
+ stats_increment_counters(_loc_agent, _loc_oid)
+
+ # if we are at or above stormwatch interval, re-eval and re-set
+ elapsed_time = int(time.time()) - sws.sw_last_stormwatch_dict_analysis
+ if elapsed_time >= sws.sw_interval_in_seconds:
+ msg = "%d seconds has elapsed since stormwatch dictionary eval (%d second threshold) - check and reset counters " % (
+ elapsed_time, sws.sw_interval_in_seconds)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ sw_log_metrics()
+ sw_reset_counter_dict()
+
+ # Note: If there's a sw_config_high_water_in_interval config value present
+ # that means it's participating in stormwatch, otherwise bail out
+ try:
+ _high_water_val = sws.sw_config_high_water_in_interval_dict[_loc_oid]
+ msg = "%s present in stormwatch config - high water value: %d" % (
+ _loc_oid, _high_water_val)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ except Exception as e:
+ return False
+
+ # build stormwatch dict key by appending agent IP and trap oid
+ _dict_key = _loc_agent + " " + _loc_oid
+
+ # increment traps encountered for agent/oid
+ sw_increment_counter(_dict_key)
+
+ # first check if storm is active for _dict_key (early bail-out if so)
+ msg = "check if stormWatch is active for %s" % (_dict_key)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ if sws.sw_storm_active_dict.get(_dict_key) is not None:
+ msg = "stormWatch is active for %s - return true" % (_dict_key)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+ return True
+ else:
+ msg = "no stormWatch active entry for %s - continue" % (_dict_key)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO, tds.CODE_GENERAL, msg)
+
+ # if we got this far, trap is in stormwatch configs, we've incremented
+ # counter in sw_storm_counter_dict - figure out if we are over limit
+ if sws.sw_storm_counter_dict[_dict_key] > _high_water_val:
+ _loc_agent = _dict_key.split()[0]
+ _loc_oid = _dict_key.split()[1]
+ msg = "STORM ACTIVE: received %d events (%s) from %s (greater than high water threshold: %d)" % (
+ sws.sw_storm_counter_dict[_dict_key], _loc_oid, _loc_agent, _high_water_val)
+ ecomp_logger(tds.LOG_TYPE_AUDIT, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ try:
+ sws.sw_storm_active_dict[_dict_key] = True
+ except Exception as e:
+ msg = "ERROR setting %s in storm active state: %s " % (
+ _dict_key, e)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_ERROR,
+ tds.CODE_GENERAL, msg)
+ return True
+ else:
+ return False
+
+# # # # # # # # # # # # #
+# fx: sw_reset_counter_dict
+# - reset counter dictionary on <interval> boundaries
+# # # # # # # # # # # # #
+
+
+def sw_reset_counter_dict():
+ """
+ Create new storm_active_dict based on quantities received during
+ last sample interval
+ :Parameters:
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch count threshold
+ :Variables:
+ """
+
+ # <stats>
+ # publish stats to MR...
+ try:
+ msg = "publish counts-by-oid from stats.oid_counter_dict"
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ msg = "publish count-by-agent from stats.agent_counter_dict"
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ except Exception as e:
+ msg = "unable to publish counts by oid and agent to MR: " % (e)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+
+ # ...and now reset stats counters to start next interval
+ try:
+ stats.oid_counter_dict.clear()
+ stats.agent_counter_dict.clear()
+ stats.total_notifications = 0
+ except Exception as e:
+ msg = "unable to reset counts by oid and agent dictionaries - stats will be INNACURATE: " % (
+ e)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_WARN, tds.CODE_GENERAL, msg)
+ # </stats>
+
+ # <storm watch active>
+
+ # for k in sws.sw_storm_active_dict:
+ # File "./snmptrapd.py", line 642, in notif_receiver_cb
+ # if stormwatch.sw_storm_active(tds.trap_dict["agent address"], tds.trap_dict["notify OID"]):
+ # File "/opt/app/snmptrap/bin/mod/trapd_stormwatch.py", line 299, in sw_storm_active
+ # sw_reset_counter_dict()
+ # File "/opt/app/snmptrap/bin/mod/trapd_stormwatch.py", line 381, in sw_reset_counter_dict
+ # for k in sws.sw_storm_active_dict:
+ # RuntimeError: dictionary changed size during iteration
+ # FIXME: changed to "for k in list(sw_storm_active_dict)" as explained here:
+ # see https://stackoverflow.com/questions/20418851/delete-an-entry-from-a-dictionary-python
+
+ for k in list(sws.sw_storm_active_dict):
+ _loc_agent = k.split()[0]
+ _loc_oid = k.split()[1]
+
+ _high_water_val = sws.sw_config_high_water_in_interval_dict[_loc_oid]
+
+ if sws.sw_storm_counter_dict[k] >= _high_water_val:
+ msg = "%s remaining in storm state, received %d events (GE to upper threshold: %d)" % (
+ k, sws.sw_storm_counter_dict[k], _high_water_val)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ sws.sw_storm_counter_dict[k] = 0
+ else:
+ _low_water_val = sws.sw_config_low_water_in_interval_dict[_loc_oid]
+ if sws.sw_storm_counter_dict[k] < _low_water_val:
+ try:
+ msg = "STORM OVER: received %d events (%s) from %s (less than low water threshold: %d)" % (
+ sws.sw_storm_counter_dict[k], _loc_oid, _loc_agent, _low_water_val)
+ ecomp_logger(tds.LOG_TYPE_AUDIT, tds.SEV_WARN,
+ tds.CODE_GENERAL, msg)
+ del sws.sw_storm_active_dict[k]
+ sws.sw_storm_counter_dict[k] = 0
+ except Exception as e:
+ msg = "unable to remove %s from storm active dictionary - TRAPS MAY BE DISCARDED UNINTENTIONALLY! Reason: %s " % (
+ k, e)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_ERROR,
+ tds.CODE_GENERAL, msg)
+ else:
+ msg = "%s remaining in storm state, received %d events (GE to lower threshold: %d)" % (
+ k, sws.sw_storm_counter_dict[k], _low_water_val)
+ ecomp_logger(tds.LOG_TYPE_ERROR, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ sws.sw_storm_counter_dict[k] = 0
+
+ sws.sw_last_stormwatch_dict_analysis = int(time.time())
+
+ return True
+
+# # # # # # # # # # # # #
+# fx: sw_increment_counter
+# - increment OID and agent trap counters
+# based on arriving trap attributes
+# # # # # # # # # # # # #
+
+
+def sw_increment_counter(_dict_key):
+ """
+ Add to appropriate counter based on arriving trap
+ agent and OID
+ :Parameters:
+ _dict_key
+ agent address from trap PDU and notify OID
+ trap PDU, separated by a space
+ :Exceptions:
+ none
+ :Keywords:
+ stormwatch count threshold
+ :Variables:
+ """
+
+ try:
+ sws.sw_storm_counter_dict[_dict_key] += 1
+ msg = "stormwatch counter for %s now: %d" % (
+ _dict_key, sws.sw_storm_counter_dict[_dict_key])
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ return True
+ except Exception as E:
+ msg = "first trap for %s - init stormwatch counter to 1" % (_dict_key)
+ ecomp_logger(tds.LOG_TYPE_DEBUG, tds.SEV_INFO,
+ tds.CODE_GENERAL, msg)
+ sws.sw_storm_counter_dict[_dict_key] = 1
+ return True