author    Gary Wu <gary.i.wu@huawei.com>  2019-01-29 00:54:16 +0000
committer Gerrit Code Review <gerrit@onap.org>  2019-01-29 00:54:16 +0000
commit    e03be3845828032fd363b3d9b9ef83424b3428b9 (patch)
tree      5c5b17268e4b4d710e936243fa3f99e8644fde32 /vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules
parent    7115cfddb7ecec3cd56b548b4365fdea1783b2d7 (diff)
parent    c942e55ceea4ce28e84168bb672a83572d0a6313 (diff)
Merge "Helm charts for Distributed Edge Analytics."
Diffstat (limited to 'vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules')
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml | 47
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml | 134
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml | 34
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml | 58
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml | 33
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml | 35
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml | 35
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml | 57
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml | 120
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml | 154
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml | 93
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml | 56
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml | 117
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml | 184
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml | 36
-rw-r--r-- vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml | 102
16 files changed, 1295 insertions, 0 deletions
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml
new file mode 100644
index 00000000..f196db48
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml
@@ -0,0 +1,47 @@
+# Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ annotations:
+ message: The configurations of the instances of the Alertmanager cluster `{{`{{$labels.service}}`}}` are out of sync.
+ expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: AlertmanagerFailedReload
+ annotations:
+ message: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}}.
+ expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: AlertmanagerMembersInconsistent
+ annotations:
+ message: Alertmanager has not found all other members of the cluster.
+ expr: |-
+ alertmanager_cluster_members{job="{{ $alertmanagerJob }}"}
+ != on (service) GROUP_LEFT()
+ count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}"})
+ for: 5m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
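
For context, the `$operatorJob` and `$alertmanagerJob` variables above splice the chart fullname into the `job` label selectors, so the rendered rules match the scrape jobs the chart itself creates. A minimal sketch of the rendered output, assuming a hypothetical release whose `prometheus-operator.fullname` resolves to `r1-prometheus-operator`:

    apiVersion: monitoring.coreos.com/v1
    kind: PrometheusRule
    metadata:
      name: r1-prometheus-operator-alertmanager.rules
    spec:
      groups:
      - name: alertmanager.rules
        rules:
        - alert: AlertmanagerFailedReload
          expr: alertmanager_config_last_reload_successful{job="r1-prometheus-operator-alertmanager"} == 0
          for: 10m
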
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml
new file mode 100644
index 00000000..dd0140db
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml
@@ -0,0 +1,134 @@
+# Generated from 'etcd' group from https://raw.githubusercontent.com/paskal/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml
+{{- if and .Values.defaultRules.create .Values.kubeEtcd.enabled }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: etcd
+ rules:
+ - alert: etcdInsufficientMembers
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": insufficient members ({{`{{ $value }}`}}).'
+ expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdNoLeader
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": member {{`{{ $labels.instance }}`}} has no leader.'
+ expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ - alert: etcdHighNumberOfLeaderChanges
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader changes within the last hour.'
+ expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 5
+ for: 5m
+ labels:
+ severity: critical
+ - alert: etcdGRPCRequestsSlow
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": gRPC requests to {{`{{ $labels.grpc_method }}`}} are taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdMemberCommunicationSlow
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": member communication with {{`{{ $labels.To }}`}} is taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedProposals
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}} proposal failures within the last hour on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile fsync durations are {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighCommitDurations
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile commit durations are {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.05
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdHTTPRequestsSlow
+ annotations:
+ message: etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method }}`}} are slow.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
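
Every one of these rule files is wrapped in a `defaultRules.create` guard, and per-component files like this one add a second toggle. A minimal values.yaml sketch showing how a user would keep the default rules but skip the etcd group (for example on a managed cluster where etcd is not scrapeable):

    defaultRules:
      create: true
    kubeEtcd:
      enabled: false   # the etcd.yaml template above then renders nothing
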
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml
new file mode 100644
index 00000000..020c29eb
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml
@@ -0,0 +1,34 @@
+# Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: general.rules
+ rules:
+ - alert: TargetDown
+ annotations:
+ message: '{{`{{ $value }}`}}% of the {{`{{ $labels.job }}`}} targets are down.'
+ expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ - alert: DeadMansSwitch
+ annotations:
+ message: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional.
+ expr: vector(1)
+ labels:
+ severity: none
+{{- end }}
\ No newline at end of file
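
DeadMansSwitch inverts the usual alerting logic: `vector(1)` always fires, so silence is the failure signal. To make that useful, the alert is typically routed to an external heartbeat service that pages when the notifications stop arriving. An illustrative Alertmanager routing sketch (the receiver name and URL are placeholders, not part of this chart):

    route:
      routes:
      - match:
          alertname: DeadMansSwitch
        receiver: watchdog
        repeat_interval: 1m      # keep re-sending while the pipeline is healthy
    receivers:
    - name: watchdog
      webhook_configs:
      - url: http://heartbeat.example.invalid/ping   # hypothetical external monitor
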
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml
new file mode 100644
index 00000000..620bd15b
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml
@@ -0,0 +1,58 @@
+# Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: k8s.rules
+ rules:
+ - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
+ record: namespace:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ sum by (namespace, pod_name, container_name) (
+ rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
+ )
+ record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
+ - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
+ record: namespace:container_memory_usage_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
+ * on (namespace, pod_name) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
+ * on (namespace, pod_name) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:container_memory_usage_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
+ * on (namespace, pod) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
+ * on (namespace, pod) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
+{{- end }}
\ No newline at end of file
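
The records above follow the kube-prometheus `level:metric:operations` naming convention, which lets dashboards and alerts read pre-aggregated series instead of re-running the expensive container-level `rate()` on every query. For illustration only:

    # Top 5 namespaces by CPU usage, read straight from the recorded series:
    topk(5, namespace:container_cpu_usage_seconds_total:sum_rate)
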
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml
new file mode 100644
index 00000000..d1db5296
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml
@@ -0,0 +1,33 @@
+# Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if and .Values.defaultRules.create .Values.kubeApiServer.enabled }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+{{- end }}
\ No newline at end of file
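
The `/ 1e+06` divides the microsecond-based `apiserver_request_latencies_bucket` histogram down to seconds before recording, which is what lets the KubeAPILatencyHigh alerts later in this change compare the recorded quantile directly against plain 1s and 4s thresholds. For illustration:

    # Recorded in seconds, so simple numeric thresholds work:
    cluster_quantile:apiserver_request_latencies:histogram_quantile{quantile="0.99"} > 1
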
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml
new file mode 100644
index 00000000..d0a643b7
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml
@@ -0,0 +1,35 @@
+# Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-alerting.rules
+ rules:
+ - alert: NodeDiskRunningFull
+ annotations:
+ message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 24 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)'
+ for: 30m
+ labels:
+ severity: warning
+ - alert: NodeDiskRunningFull
+ annotations:
+ message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 2 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)'
+ for: 10m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
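
Both variants pair a usage floor with `predict_linear`, which fits a least-squares line over the sampled window and extrapolates it forward. A short sketch of the idea (the device label is hypothetical):

    # Projects the 6h trend of available bytes 24h (3600 * 24 s) ahead;
    # a negative result means the series is on track to hit zero.
    predict_linear(node:node_filesystem_avail:{device="/dev/vda1"}[6h], 3600 * 24) < 0
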
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml
new file mode 100644
index 00000000..87d3556a
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml
@@ -0,0 +1,35 @@
+# Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-recording.rules
+ rules:
+ - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+ record: instance:node_cpu:rate:sum
+ - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance)
+ record: instance:node_filesystem_usage:sum
+ - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+ record: instance:node_network_receive_bytes:rate:sum
+ - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+ record: instance:node_network_transmit_bytes:rate:sum
+ - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+ record: instance:node_cpu:ratio
+ - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
+ record: cluster:node_cpu:sum_rate5m
+ - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
+ record: cluster:node_cpu:ratio
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml
new file mode 100644
index 00000000..3a279661
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml
@@ -0,0 +1,57 @@
+# Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if and .Values.defaultRules.create .Values.kubeScheduler.enabled }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-scheduler.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml
new file mode 100644
index 00000000..37fc5465
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml
@@ -0,0 +1,120 @@
+# Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-absent
+ rules:
+ - alert: AlertmanagerDown
+ annotations:
+ message: Alertmanager has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
+ expr: absent(up{job="{{ $alertmanagerJob }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.kubeDns.enabled }}
+ - alert: CoreDNSDown
+ annotations:
+ message: CoreDNS has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
+ expr: absent(up{job="kube-dns"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeApiServer.enabled }}
+ - alert: KubeAPIDown
+ annotations:
+ message: KubeAPI has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
+ expr: absent(up{job="apiserver"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeControllerManager.enabled }}
+ - alert: KubeControllerManagerDown
+ annotations:
+ message: KubeControllerManager has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
+ expr: absent(up{job="kube-controller-manager"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeScheduler.enabled }}
+ - alert: KubeSchedulerDown
+ annotations:
+ message: KubeScheduler has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
+ expr: absent(up{job="kube-scheduler"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeStateMetrics.enabled }}
+ - alert: KubeStateMetricsDown
+ annotations:
+ message: KubeStateMetrics has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
+ expr: absent(up{job="kube-state-metrics"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.prometheusOperator.kubeletService.enabled }}
+ - alert: KubeletDown
+ annotations:
+ message: Kubelet has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
+ expr: absent(up{job="kubelet"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.nodeExporter.enabled }}
+ - alert: NodeExporterDown
+ annotations:
+ message: NodeExporter has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
+ expr: absent(up{job="node-exporter"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+ - alert: PrometheusDown
+ annotations:
+ message: Prometheus has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
+ expr: absent(up{job="{{ $prometheusJob }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.prometheusOperator.enabled }}
+ - alert: PrometheusOperatorDown
+ annotations:
+ message: PrometheusOperator has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
+ expr: absent(up{job="{{ $operatorJob }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }}
\ No newline at end of file
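
All of the *Down alerts rely on the same `absent()` idiom: the function returns a single sample with value 1 when its argument matches no series, and nothing at all otherwise. For illustration:

    # Fires while no scrape target for the job reports up == 1,
    # and resolves as soon as any single target comes back:
    absent(up{job="kube-scheduler"} == 1)
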
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml
new file mode 100644
index 00000000..21549c23
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml
@@ -0,0 +1,154 @@
+# Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if and .Values.defaultRules.create .Values.kubeStateMetrics.enabled }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-apps
+ rules:
+ - alert: KubePodCrashLooping
+ annotations:
+ message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} ({{`{{ $labels.container }}`}}) is restarting {{`{{ printf "%.2f" $value }}`}} times / 5 minutes.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
+ expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubePodNotReady
+ annotations:
+ message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} has been in a non-ready state for longer than an hour.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
+ expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeDeploymentGenerationMismatch
+ annotations:
+ message: Deployment generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} does not match; this indicates that the Deployment has failed but has not been rolled back.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
+ expr: |-
+ kube_deployment_status_observed_generation{job="kube-state-metrics"}
+ !=
+ kube_deployment_metadata_generation{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDeploymentReplicasMismatch
+ annotations:
+ message: Deployment {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} has not matched the expected number of replicas for longer than an hour.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
+ expr: |-
+ kube_deployment_spec_replicas{job="kube-state-metrics"}
+ !=
+ kube_deployment_status_replicas_available{job="kube-state-metrics"}
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetReplicasMismatch
+ annotations:
+ message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} has not matched the expected number of replicas for longer than 15 minutes.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
+ expr: |-
+ kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
+ !=
+ kube_statefulset_status_replicas{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetGenerationMismatch
+ annotations:
+ message: StatefulSet generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
+ expr: |-
+ kube_statefulset_status_observed_generation{job="kube-state-metrics"}
+ !=
+ kube_statefulset_metadata_generation{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetUpdateNotRolledOut
+ annotations:
+ message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} update has not been rolled out.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
+ expr: |-
+ max without (revision) (
+ kube_statefulset_status_current_revision{job="kube-state-metrics"}
+ unless
+ kube_statefulset_status_update_revision{job="kube-state-metrics"}
+ )
+ *
+ (
+ kube_statefulset_replicas{job="kube-state-metrics"}
+ !=
+ kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetRolloutStuck
+ annotations:
+ message: Only {{`{{ $value }}`}}% of the desired Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are scheduled and ready.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
+ expr: |-
+ kube_daemonset_status_number_ready{job="kube-state-metrics"}
+ /
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetNotScheduled
+ annotations:
+ message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are not scheduled.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
+ expr: |-
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
+ -
+ kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeDaemonSetMisScheduled
+ annotations:
+ message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are running where they are not supposed to run.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
+ expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeCronJobRunning
+ annotations:
+ message: CronJob {{`{{ $labels.namespace }}`}}/{{`{{ $labels.cronjob }}`}} is taking more than 1h to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
+ expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobCompletion
+ annotations:
+ message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} is taking more than one hour to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
+ expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobFailed
+ annotations:
+ message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} failed to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
+ expr: kube_job_status_failed{job="kube-state-metrics"} > 0
+ for: 1h
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml
new file mode 100644
index 00000000..4a7b9f95
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml
@@ -0,0 +1,93 @@
+# Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-resources
+ rules:
+ - alert: KubeCPUOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
+ /
+ sum(node:node_num_cpu:sum)
+ >
+ (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
+ /
+ sum(node_memory_MemTotal_bytes)
+ >
+ (count(node:node_num_cpu:sum)-1)
+ /
+ count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeCPUOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Namespaces.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
+ /
+ sum(node:node_num_cpu:sum)
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Namespaces.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeQuotaExceeded
+ annotations:
+ message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
+ expr: |-
+ 100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+ / ignoring(instance, job, type)
+ (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+ > 90
+ for: 15m
+ labels:
+ severity: warning
+ - alert: CPUThrottlingHigh
+ annotations:
+ message: '{{`{{ printf "%0.0f" $value }}`}}% throttling of CPU in namespace {{`{{ $labels.namespace }}`}} for container {{`{{ $labels.container_name }}`}} in pod {{`{{ $labels.pod_name }}`}}.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
+ expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25"
+ for: 15m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
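
The `(count(...) - 1) / count(...)` threshold in the first two alerts encodes an N-1 capacity rule: the requested resources must still fit if one node dies. A worked example, for illustration:

    # With 4 schedulable nodes: (4 - 1) / 4 = 0.75, so the alert fires once
    # total CPU requests exceed 75% of cluster capacity -- the point at which
    # the workload would no longer fit on the remaining 3 nodes.
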
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml
new file mode 100644
index 00000000..d290f0cf
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml
@@ -0,0 +1,56 @@
+# Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-storage
+ rules:
+ - alert: KubePersistentVolumeUsageCritical
+ annotations:
+ message: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ printf "%0.2f" $value }}`}}% free.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
+ expr: |-
+ 100 * kubelet_volume_stats_available_bytes{job="kubelet"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet"}
+ < 3
+ for: 1m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeFullInFourDays
+ annotations:
+ message: Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ printf "%0.2f" $value }}`}}% is available.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
+ expr: |-
+ 100 * (
+ kubelet_volume_stats_available_bytes{job="kubelet"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet"}
+ ) < 15
+ and
+ predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeErrors
+ annotations:
+ message: The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml
new file mode 100644
index 00000000..78f90b79
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml
@@ -0,0 +1,117 @@
+# Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system
+ rules:
+ - alert: KubeNodeNotReady
+ annotations:
+ message: '{{`{{ $labels.node }}`}} has been unready for more than an hour.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
+ expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeVersionMismatch
+ annotations:
+ message: There are {{`{{ $value }}`}} different versions of Kubernetes components running.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
+ expr: count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeClientErrors
+ annotations:
+ message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}}% errors.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
+ expr: |-
+ (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
+ /
+ sum(rate(rest_client_requests_total[5m])) by (instance, job))
+ * 100 > 1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeClientErrors
+ annotations:
+ message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}} errors / second.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
+ expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeletTooManyPods
+ annotations:
+ message: Kubelet {{`{{ $labels.instance }}`}} is running {{`{{ $value }}`}} Pods, close to the limit of 110.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
+ expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is returning errors for {{`{{ $value }}`}}% of requests.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is returning errors for {{`{{ $value }}`}}% of requests.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: Kubernetes API certificate is expiring in less than 7 days.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
+ expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: Kubernetes API certificate is expiring in less than 24 hours.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
+ expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
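
The two KubeClientCertificateExpiration alerts take the 1st percentile of the client-certificate expiration histogram, so they trip as soon as even a small share of connecting clients present certificates close to expiry; the thresholds are plain seconds. For illustration:

    # 604800 s = 7 * 24 * 3600 (one week); 86400 s = 24 * 3600 (one day).
    histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
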
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml
new file mode 100644
index 00000000..b039d1ac
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml
@@ -0,0 +1,184 @@
+# Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if and .Values.defaultRules.create .Values.nodeExporter.enabled }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node.rules
+ rules:
+ - expr: sum(min(kube_pod_info) by (node))
+ record: ':kube_pod_info_node_count:'
+ - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+ record: 'node_namespace_pod:kube_pod_info:'
+ - expr: |-
+ count by (node) (sum by (node, cpu) (
+ node_cpu_seconds_total{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ ))
+ record: node:node_num_cpu:sum
+ - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
+ record: :node_cpu_utilisation:avg1m
+ - expr: |-
+ 1 - avg by (node) (
+ rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:)
+ record: node:node_cpu_utilisation:avg1m
+ - expr: |-
+ sum(node_load1{job="node-exporter"})
+ /
+ sum(node:node_num_cpu:sum)
+ record: ':node_cpu_saturation_load1:'
+ - expr: |-
+ sum by (node) (
+ node_load1{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ node:node_num_cpu:sum
+ record: 'node:node_cpu_saturation_load1:'
+ - expr: |-
+ 1 -
+ sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: ':node_memory_utilisation:'
+ - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ record: :node_memory_MemFreeCachedBuffers_bytes:sum
+ - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: :node_memory_MemTotal_bytes:sum
+ - expr: |-
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_available:sum
+ - expr: |-
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_total:sum
+ - expr: |-
+ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+ /
+ scalar(sum(node:node_memory_bytes_total:sum))
+ record: node:node_memory_utilisation:ratio
+ - expr: |-
+ 1e3 * sum(
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ )
+ record: :node_memory_swap_io_bytes:sum_rate
+ - expr: |-
+ 1 -
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: 'node:node_memory_utilisation:'
+ - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
+ record: 'node:node_memory_utilisation_2:'
+ - expr: |-
+ 1e3 * sum by (node) (
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_swap_io_bytes:sum_rate
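+ # Disk utilisation: the rate of io_time is the fraction of time the
+ # device was busy (iostat %util); the device regex limits this to common
+ # physical disks. The /1e3 in the saturation rules below is carried over
+ # from the upstream generated rules.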
+ - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
+ record: :node_disk_utilisation:avg_irate
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_utilisation:avg_irate
+ - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
+ record: :node_disk_saturation:avg_irate
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_saturation:avg_irate
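+ # Filesystem usage: fraction of bytes used per device, restricted to
+ # real local filesystems by the fstype regex.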
+ - expr: |-
+ max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
+ - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_usage:'
+ - expr: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_avail:'
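+ # Network rules are hard-coded to device="eth0" (as generated upstream);
+ # nodes whose primary interface has another name are not counted.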
+ - expr: |-
+ sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
+ sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
+ record: :node_net_utilisation:sum_irate
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
+ irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_utilisation:sum_irate
+ - expr: |-
+ sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
+ sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
+ record: :node_net_saturation:sum_irate
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
+ irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_saturation:sum_irate
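+ # Inode rules: node-exporter's instance label ("ip:port") is rewritten
+ # into host_ip so root-filesystem inode counts can be joined to
+ # kube_pod_info and pick up the node name via group_right.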
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_total:'
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_free:'
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml
new file mode 100644
index 00000000..7f19763e
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml
@@ -0,0 +1,36 @@
+# Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus-operator
+ rules:
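+ # The {{`...`}} construct makes Helm emit the inner {{ ... }} literally,
+ # leaving Prometheus's own annotation templating intact.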
+ - alert: PrometheusOperatorReconcileErrors
+ annotations:
+ message: Errors while reconciling {{`{{ $labels.controller }}`}} in {{`{{ $labels.namespace }}`}} namespace.
+ expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusOperatorNodeLookupErrors
+ annotations:
+ message: Errors while reconciling Prometheus in {{`{{ $labels.namespace }}`}} namespace.
+ expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml
new file mode 100644
index 00000000..e2d8a68d
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml
@@ -0,0 +1,102 @@
+# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if .Values.defaultRules.create }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus.rules
+ rules:
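+ # All expressions are scoped to this release's Prometheus via the
+ # $prometheusJob job label computed at the top of the template.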
+ - alert: PrometheusConfigReloadFailed
+ annotations:
+ description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
+ summary: Reloading Prometheus' configuration failed
+ expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
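+ # predict_linear extrapolates the queue length 30 minutes (60 * 30s)
+ # ahead from the last 5m of data; the alert fires if the forecast
+ # exceeds the configured queue capacity.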
+ - alert: PrometheusNotificationQueueRunningFull
+ annotations:
+ description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
+ summary: Prometheus' alert notification queue is running full
+ expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"}
+ for: 10m
+ labels:
+ severity: warning
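+ # The next two alerts share one expression with tiered thresholds:
+ # an error ratio above 1% warns, above 3% is critical.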
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
+ summary: Errors while sending alerts from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
+ summary: Errors while sending alerts from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03
+ for: 10m
+ labels:
+ severity: critical
+ - alert: PrometheusNotConnectedToAlertmanagers
+ annotations:
+ description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
+ summary: Prometheus is not connected to any Alertmanagers
+ expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBReloadsFailing
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last two hours.'
+ summary: Prometheus has issues reloading data blocks from disk
+ expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBCompactionsFailing
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last two hours.'
+ summary: Prometheus has issues compacting sample blocks
+ expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBWALCorruptions
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
+ summary: Prometheus write-ahead log is corrupted
+ expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusNotIngestingSamples
+ annotations:
+ description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
+ summary: Prometheus isn't ingesting samples
+ expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTargetScrapesDuplicate
+ annotations:
+ description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values.'
+ summary: Prometheus has many samples rejected
+ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file