author     Dileep Ranganathan <dileep.ranganathan@intel.com>  2019-03-05 10:24:06 -0800
committer  Dileep Ranganathan <dileep.ranganathan@intel.com>  2019-03-05 10:38:48 -0800
commit     3dfd3180c0a4d192f4524d74e36d2ba50bffff71 (patch)
tree       7df49d15b185b73af9a902b17323e5fba46b208f /vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules
parent     1b81e8f0b51576f761aa8e3329285bfb61e6dd79 (diff)
Collection Service Helm charts package
The packages needed for distributed analytics are separated as collection, messaging, training, inference and visualization. The collection package consists of the collection agents, the Prometheus Operator, and Prometheus.

Change-Id: I12c6ed0607fbaedf7bbc207562fb5bf2a1950623
Issue-ID: ONAPARC-366
Signed-off-by: Dileep Ranganathan <dileep.ranganathan@intel.com>
Diffstat (limited to 'vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules')
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml  50
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/etcd.yaml  136
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/general.rules.yaml  46
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml  60
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml  35
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml  37
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml  37
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml  59
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml  123
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml  156
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml  99
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml  58
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml  119
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/node.rules.yaml  198
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml  39
-rw-r--r--  vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml  105
16 files changed, 1357 insertions, 0 deletions
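
Each of the files below is a Helm template that renders one PrometheusRule custom resource, and every group is gated on flags under defaultRules in the chart values. A minimal, hedged values.yaml sketch of the toggles these templates reference (key names are taken from the template conditions; the defaults shipped with the chart may differ):

# Hedged sketch of the value keys checked by the rule templates in this change.
defaultRules:
  create: true                 # master switch used by every template
  rules:                       # one flag per rule group file
    alertmanager: true
    etcd: true
    general: true
    k8s: true
    kubeApiserver: true
    kubePrometheusNodeAlerting: true
    kubePrometheusNodeRecording: true
    kubeScheduler: true
    kubernetesAbsent: true
    kubernetesApps: true
    kubernetesResources: true
    kubernetesStorage: true
    kubernetesSystem: true
    node: true
  labels: {}                   # extra labels merged into each PrometheusRule
  annotations: {}              # extra annotations merged into each PrometheusRule
prometheusOperator:
  crdApiGroup: monitoring.coreos.com   # default API group used to build apiVersion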
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml
new file mode 100644
index 00000000..c1762fd1
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml
@@ -0,0 +1,50 @@
+# Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+{{- $namespace := .Release.Namespace }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ annotations:
+ message: The configuration of the instances of the Alertmanager cluster `{{`{{$labels.service}}`}}` are out of sync.
+ expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}, "service", "$1", "name", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: AlertmanagerFailedReload
+ annotations:
+ message: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}}.
+ expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: AlertmanagerMembersInconsistent
+ annotations:
+ message: Alertmanager has not found all other members of the cluster.
+ expr: |-
+ alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
+ != on (service) GROUP_LEFT()
+ count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
+ for: 5m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
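
The alertmanager.rules template derives the Prometheus job labels from the chart fullname ($operatorJob, $alertmanagerJob) and pins every selector to the release namespace. A hedged example of what the AlertmanagerFailedReload rule looks like once rendered, assuming the fullname resolves to collection-prometheus-operator and the release namespace is monitoring (both hypothetical):

# Hedged rendered output; the fullname and namespace below are illustrative.
- alert: AlertmanagerFailedReload
  annotations:
    message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
  expr: alertmanager_config_last_reload_successful{job="collection-prometheus-operator-alertmanager",namespace="monitoring"} == 0
  for: 10m
  labels:
    severity: warning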
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/etcd.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/etcd.yaml
new file mode 100644
index 00000000..a68eeff2
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/etcd.yaml
@@ -0,0 +1,136 @@
+# Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: etcd
+ rules:
+ - alert: etcdInsufficientMembers
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": insufficient members ({{`{{ $value }}`}}).'
+ expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdNoLeader
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": member {{`{{ $labels.instance }}`}} has no leader.'
+ expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ - alert: etcdHighNumberOfLeaderChanges
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader changes within the last hour.'
+ expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 5
+ for: 5m
+ labels:
+ severity: critical
+ - alert: etcdGRPCRequestsSlow
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": gRPC requests to {{`{{ $labels.grpc_method }}`}} are taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdMemberCommunicationSlow
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": member communication with {{`{{ $labels.To }}`}} is taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedProposals
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}} proposal failures within the last hour on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile fsync durations are {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighCommitDurations
+ annotations:
+ message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile commit durations {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.05
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdHTTPRequestsSlow
+ annotations:
+ message: etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method }}`}} are slow.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
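
The etcd group is additionally gated on kubeEtcd.enabled, so clusters where Prometheus cannot scrape etcd (for example managed control planes) can drop the whole group. Hedged values.yaml sketches for either switch:

# Hedged: either disable the etcd scrape target and its rules together...
kubeEtcd:
  enabled: false
# ...or keep scraping etcd and drop only the default etcd rule group.
defaultRules:
  rules:
    etcd: false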
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/general.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/general.rules.yaml
new file mode 100644
index 00000000..9f8349f9
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/general.rules.yaml
@@ -0,0 +1,46 @@
+# Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.general }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: general.rules
+ rules:
+ - alert: TargetDown
+ annotations:
+ message: '{{`{{ $value }}`}}% of the {{`{{ $labels.job }}`}} targets are down.'
+ expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ - alert: Watchdog
+ annotations:
+ message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
+
+ This alert is always firing, therefore it should always be firing in Alertmanager
+
+ and always fire against a receiver. There are integrations with various notification
+
+ mechanisms that send a notification when this alert is not firing. For example the
+
+ "DeadMansSnitch" integration in PagerDuty.
+
+ '
+ expr: vector(1)
+ labels:
+ severity: none
+{{- end }}
\ No newline at end of file
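
The Watchdog alert is intentionally always firing; it only has value if something notices when it stops. A hedged Alertmanager routing fragment that forwards it to a dead-man's-switch style receiver (the receiver name and webhook URL are placeholders, not part of this chart):

# Hedged Alertmanager config fragment; receiver name and URL are placeholders.
route:
  receiver: default
  routes:
    - match:
        alertname: Watchdog
      receiver: deadmansswitch
      repeat_interval: 5m        # keep re-notifying while the pipeline is healthy
receivers:
  - name: default
  - name: deadmansswitch
    webhook_configs:
      - url: https://example.invalid/deadmansswitch   # placeholder endpoint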
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml
new file mode 100644
index 00000000..678df008
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml
@@ -0,0 +1,60 @@
+# Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: k8s.rules
+ rules:
+ - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
+ record: namespace:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ sum by (namespace, pod_name, container_name) (
+ rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
+ )
+ record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
+ - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
+ record: namespace:container_memory_usage_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
+ * on (namespace, pod_name) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
+ * on (namespace, pod_name) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:container_memory_usage_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
+ * on (namespace, pod) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
+ - expr: |-
+ sum by (namespace, label_name) (
+ sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
+ * on (namespace, pod) group_left(label_name)
+ label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
+ )
+ record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
+{{- end }}
\ No newline at end of file
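
The k8s.rules group only pre-aggregates series; the value comes from other rules and dashboards querying the recorded names. A hedged, illustrative PrometheusRule that alerts on one of the series recorded above (resource name, alert name and threshold are made up for the example):

# Illustrative only; name, alert and threshold are hypothetical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: example-namespace-memory
spec:
  groups:
    - name: example.rules
      rules:
        - alert: NamespaceMemoryUsageHigh
          expr: namespace:container_memory_usage_bytes:sum > 8e9   # ~8 GiB, illustrative
          for: 15m
          labels:
            severity: warning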
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml
new file mode 100644
index 00000000..cbb19cb7
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml
@@ -0,0 +1,35 @@
+# Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+{{- end }}
\ No newline at end of file
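
apiserver_request_latencies_bucket is exported in microseconds, so dividing by 1e+06 records the percentiles in seconds, and the quantile label distinguishes p99/p90/p50 under one recorded name. The KubeAPILatencyHigh alerts in kubernetes-system.yaml below consume the p99 series; a hedged, simplified fragment using the same series:

# Illustrative alert fragment over the recorded p99 series (values in seconds);
# the alert name and threshold are hypothetical.
- alert: ExampleApiserverLatencyHigh
  expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{quantile="0.99"} > 1
  for: 10m
  labels:
    severity: warning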
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml
new file mode 100644
index 00000000..2df9a096
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml
@@ -0,0 +1,37 @@
+# Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeAlerting }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-alerting.rules
+ rules:
+ - alert: NodeDiskRunningFull
+ annotations:
+ message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 24 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)'
+ for: 30m
+ labels:
+ severity: warning
+ - alert: NodeDiskRunningFull
+ annotations:
+ message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 2 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)'
+ for: 10m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml
new file mode 100644
index 00000000..0d2ff510
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml
@@ -0,0 +1,37 @@
+# Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-recording.rules
+ rules:
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+ record: instance:node_cpu:rate:sum
+ - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
+ record: instance:node_filesystem_usage:sum
+ - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+ record: instance:node_network_receive_bytes:rate:sum
+ - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+ record: instance:node_network_transmit_bytes:rate:sum
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
+ record: instance:node_cpu:ratio
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
+ record: cluster:node_cpu:sum_rate5m
+ - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+ record: cluster:node_cpu:ratio
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml
new file mode 100644
index 00000000..e51b0181
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml
@@ -0,0 +1,59 @@
+# Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-scheduler.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml
new file mode 100644
index 00000000..19b09491
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml
@@ -0,0 +1,123 @@
+# Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubernetesAbsent }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+{{- $namespace := .Release.Namespace }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-absent
+ rules:
+ - alert: AlertmanagerDown
+ annotations:
+ message: Alertmanager has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
+ expr: absent(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.kubeDns.enabled }}
+ - alert: CoreDNSDown
+ annotations:
+ message: CoreDNS has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
+ expr: absent(up{job="kube-dns"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeApiServer.enabled }}
+ - alert: KubeAPIDown
+ annotations:
+ message: KubeAPI has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
+ expr: absent(up{job="apiserver"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeControllerManager.enabled }}
+ - alert: KubeControllerManagerDown
+ annotations:
+ message: KubeControllerManager has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
+ expr: absent(up{job="kube-controller-manager"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeScheduler.enabled }}
+ - alert: KubeSchedulerDown
+ annotations:
+ message: KubeScheduler has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
+ expr: absent(up{job="kube-scheduler"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeStateMetrics.enabled }}
+ - alert: KubeStateMetricsDown
+ annotations:
+ message: KubeStateMetrics has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
+ expr: absent(up{job="kube-state-metrics"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.prometheusOperator.kubeletService.enabled }}
+ - alert: KubeletDown
+ annotations:
+ message: Kubelet has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
+ expr: absent(up{job="kubelet"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.nodeExporter.enabled }}
+ - alert: NodeExporterDown
+ annotations:
+ message: NodeExporter has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
+ expr: absent(up{job="node-exporter"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+ - alert: PrometheusDown
+ annotations:
+ message: Prometheus has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
+ expr: absent(up{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.prometheusOperator.enabled }}
+ - alert: PrometheusOperatorDown
+ annotations:
+ message: PrometheusOperator has disappeared from Prometheus target discovery.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
+ expr: absent(up{job="{{ $operatorJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }}
\ No newline at end of file
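
Most of the *Down alerts in this group are wrapped in per-component conditionals, so turning a scrape target off in values.yaml also removes its absence alert. A hedged sketch of the flags referenced above:

# Hedged sketch: each flag gates one alert in the kubernetes-absent group.
kubeDns:
  enabled: true                  # CoreDNSDown
kubeApiServer:
  enabled: true                  # KubeAPIDown
kubeControllerManager:
  enabled: true                  # KubeControllerManagerDown
kubeScheduler:
  enabled: true                  # KubeSchedulerDown
kubeStateMetrics:
  enabled: true                  # KubeStateMetricsDown
nodeExporter:
  enabled: true                  # NodeExporterDown
prometheusOperator:
  enabled: true                  # PrometheusOperatorDown
  kubeletService:
    enabled: true                # KubeletDown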
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml
new file mode 100644
index 00000000..d3d2c498
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml
@@ -0,0 +1,156 @@
+# Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.kubeStateMetrics.enabled .Values.defaultRules.rules.kubernetesApps }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-apps
+ rules:
+ - alert: KubePodCrashLooping
+ annotations:
+ message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} ({{`{{ $labels.container }}`}}) is restarting {{`{{ printf "%.2f" $value }}`}} times / 5 minutes.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
+ expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubePodNotReady
+ annotations:
+ message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} has been in a non-ready state for longer than an hour.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
+ expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeDeploymentGenerationMismatch
+ annotations:
+ message: Deployment generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
+ expr: |-
+ kube_deployment_status_observed_generation{job="kube-state-metrics"}
+ !=
+ kube_deployment_metadata_generation{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDeploymentReplicasMismatch
+ annotations:
+ message: Deployment {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} has not matched the expected number of replicas for longer than an hour.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
+ expr: |-
+ kube_deployment_spec_replicas{job="kube-state-metrics"}
+ !=
+ kube_deployment_status_replicas_available{job="kube-state-metrics"}
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetReplicasMismatch
+ annotations:
+ message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} has not matched the expected number of replicas for longer than 15 minutes.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
+ expr: |-
+ kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
+ !=
+ kube_statefulset_status_replicas{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetGenerationMismatch
+ annotations:
+ message: StatefulSet generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
+ expr: |-
+ kube_statefulset_status_observed_generation{job="kube-state-metrics"}
+ !=
+ kube_statefulset_metadata_generation{job="kube-state-metrics"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetUpdateNotRolledOut
+ annotations:
+ message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} update has not been rolled out.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
+ expr: |-
+ max without (revision) (
+ kube_statefulset_status_current_revision{job="kube-state-metrics"}
+ unless
+ kube_statefulset_status_update_revision{job="kube-state-metrics"}
+ )
+ *
+ (
+ kube_statefulset_replicas{job="kube-state-metrics"}
+ !=
+ kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetRolloutStuck
+ annotations:
+ message: Only {{`{{ $value }}`}}% of the desired Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are scheduled and ready.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
+ expr: |-
+ kube_daemonset_status_number_ready{job="kube-state-metrics"}
+ /
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetNotScheduled
+ annotations:
+ message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are not scheduled.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
+ expr: |-
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
+ -
+ kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeDaemonSetMisScheduled
+ annotations:
+ message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are running where they are not supposed to run.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
+ expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeCronJobRunning
+ annotations:
+ message: CronJob {{`{{ $labels.namespace }}`}}/{{`{{ $labels.cronjob }}`}} is taking more than 1h to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
+ expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobCompletion
+ annotations:
+ message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} is taking more than one hour to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
+ expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobFailed
+ annotations:
+ message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} failed to complete.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
+ expr: kube_job_status_failed{job="kube-state-metrics"} > 0
+ for: 1h
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml
new file mode 100644
index 00000000..ed4a83c6
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml
@@ -0,0 +1,99 @@
+# Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-resources
+ rules:
+ - alert: KubeCPUOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
+ /
+ sum(node:node_num_cpu:sum)
+ >
+ (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
+ /
+ sum(node_memory_MemTotal_bytes)
+ >
+ (count(node:node_num_cpu:sum)-1)
+ /
+ count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeCPUOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Namespaces.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
+ /
+ sum(node:node_num_cpu:sum)
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Namespaces.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeQuotaExceeded
+ annotations:
+ message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
+ expr: |-
+ 100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+ / ignoring(instance, job, type)
+ (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+ > 90
+ for: 15m
+ labels:
+ severity: warning
+ - alert: CPUThrottlingHigh
+ annotations:
+ message: '{{`{{ printf "%0.0f" $value }}`}}% throttling of CPU in namespace {{`{{ $labels.namespace }}`}} for container {{`{{ $labels.container_name }}`}} in pod {{`{{ $labels.pod_name }}`}}.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
+ expr: |-
+ 100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
+ /
+ sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
+ > 25
+ for: 15m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
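
The KubeCPUOvercommit and KubeMemOvercommit thresholds are (N-1)/N of cluster capacity, i.e. the point past which the remaining nodes could no longer absorb the loss of one node. A short worked example with illustrative numbers:

# Illustrative arithmetic for KubeCPUOvercommit (numbers are made up):
#   4 nodes x 8 cores           = 32 cores of node:node_num_cpu:sum
#   threshold (4 - 1) / 4       = 0.75
#   the alert fires once summed CPU requests exceed 0.75 * 32 = 24 cores,
#   because a single node failure would leave too little capacity for them.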
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml
new file mode 100644
index 00000000..edd8f5fc
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml
@@ -0,0 +1,58 @@
+# Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-storage
+ rules:
+ - alert: KubePersistentVolumeUsageCritical
+ annotations:
+ message: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ printf "%0.2f" $value }}`}}% free.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
+ expr: |-
+ 100 * kubelet_volume_stats_available_bytes{job="kubelet"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet"}
+ < 3
+ for: 1m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeFullInFourDays
+ annotations:
+ message: Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ printf "%0.2f" $value }}`}}% is available.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
+ expr: |-
+ 100 * (
+ kubelet_volume_stats_available_bytes{job="kubelet"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet"}
+ ) < 15
+ and
+ predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeErrors
+ annotations:
+ message: The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml
new file mode 100644
index 00000000..8ccfa5bf
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml
@@ -0,0 +1,119 @@
+# Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file first read following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system
+ rules:
+ - alert: KubeNodeNotReady
+ annotations:
+ message: '{{`{{ $labels.node }}`}} has been unready for more than an hour.'
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
+ expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeVersionMismatch
+ annotations:
+ message: There are {{`{{ $value }}`}} different semantic versions of Kubernetes components running.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
+ expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeClientErrors
+ annotations:
+ message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}}% errors.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
+ expr: |-
+ (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
+ /
+ sum(rate(rest_client_requests_total[5m])) by (instance, job))
+ * 100 > 1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeClientErrors
+ annotations:
+ message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}} errors / second.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
+ expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeletTooManyPods
+ annotations:
+ message: Kubelet {{`{{ $labels.instance }}`}} is running {{`{{ $value }}`}} Pods, close to the limit of 110.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
+ expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is returning errors for {{`{{ $value }}`}}% of requests.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is returning errors for {{`{{ $value }}`}}% of requests.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 7 days.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
+ expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
+ expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
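Note on templating: the {{` ... `}} wrapping used throughout these annotations is Helm escaping, not Prometheus syntax. The outer Helm action prints the back-quoted raw string verbatim, so the inner {{ $labels.* }} and {{ $value }} expressions survive chart rendering and are only expanded by Prometheus when the alert fires. As a rough sketch (assuming default values), the KubeClientErrors annotation above renders to plain rule YAML along these lines:

  annotations:
    message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
    runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors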
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/node.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/node.rules.yaml
new file mode 100644
index 00000000..35245437
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/node.rules.yaml
@@ -0,0 +1,198 @@
+# Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file, first read the following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node.rules
+ rules:
+ - expr: sum(min(kube_pod_info) by (node))
+ record: ':kube_pod_info_node_count:'
+ - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+ record: 'node_namespace_pod:kube_pod_info:'
+ - expr: |-
+ count by (node) (sum by (node, cpu) (
+ node_cpu_seconds_total{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ ))
+ record: node:node_num_cpu:sum
+ - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
+ record: :node_cpu_utilisation:avg1m
+ - expr: |-
+ 1 - avg by (node) (
+ rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:)
+ record: node:node_cpu_utilisation:avg1m
+ - expr: |-
+ node:node_cpu_utilisation:avg1m
+ *
+ node:node_num_cpu:sum
+ /
+ scalar(sum(node:node_num_cpu:sum))
+ record: node:cluster_cpu_utilisation:ratio
+ - expr: |-
+ sum(node_load1{job="node-exporter"})
+ /
+ sum(node:node_num_cpu:sum)
+ record: ':node_cpu_saturation_load1:'
+ - expr: |-
+ sum by (node) (
+ node_load1{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ node:node_num_cpu:sum
+ record: 'node:node_cpu_saturation_load1:'
+ - expr: |-
+ 1 -
+ sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: ':node_memory_utilisation:'
+ - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ record: :node_memory_MemFreeCachedBuffers_bytes:sum
+ - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: :node_memory_MemTotal_bytes:sum
+ - expr: |-
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_available:sum
+ - expr: |-
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_total:sum
+ - expr: |-
+ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+ /
+ node:node_memory_bytes_total:sum
+ record: node:node_memory_utilisation:ratio
+ - expr: |-
+ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+ /
+ scalar(sum(node:node_memory_bytes_total:sum))
+ record: node:cluster_memory_utilisation:ratio
+ - expr: |-
+ 1e3 * sum(
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ )
+ record: :node_memory_swap_io_bytes:sum_rate
+ - expr: |-
+ 1 -
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: 'node:node_memory_utilisation:'
+ - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
+ record: 'node:node_memory_utilisation_2:'
+ - expr: |-
+ 1e3 * sum by (node) (
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_swap_io_bytes:sum_rate
+ - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
+ record: :node_disk_utilisation:avg_irate
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_utilisation:avg_irate
+ - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
+ record: :node_disk_saturation:avg_irate
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_saturation:avg_irate
+ - expr: |-
+ max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
+ - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_usage:'
+ - expr: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_avail:'
+ - expr: |-
+ sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
+ sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
+ record: :node_net_utilisation:sum_irate
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
+ irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_utilisation:sum_irate
+ - expr: |-
+ sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
+ sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
+ record: :node_net_saturation:sum_irate
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
+ irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_saturation:sum_irate
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_total:'
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_free:'
+{{- end }}
\ No newline at end of file
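Note on enabling this group: node.rules only renders when the guard at the top of the template is satisfied. Below is a minimal values.yaml sketch using the keys the template actually consults (defaultRules.create, defaultRules.rules.node, nodeExporter.enabled, defaultRules.labels/annotations); the extra release label is a hypothetical example, and the authoritative defaults live in the chart's values.yaml:

  nodeExporter:
    enabled: true          # required by the template guard
  defaultRules:
    create: true           # master switch for all generated PrometheusRule objects
    rules:
      node: true           # enables this node.rules group specifically
    labels:
      release: collection  # hypothetical extra label merged into metadata.labels
    annotations: {}        # optional annotations block, rendered only when non-empty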
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml
new file mode 100644
index 00000000..774a540c
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml
@@ -0,0 +1,39 @@
+# Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file, first read the following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $namespace := .Release.Namespace }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus-operator
+ rules:
+ - alert: PrometheusOperatorReconcileErrors
+ annotations:
+ message: Errors while reconciling {{`{{ $labels.controller }}`}} in {{`{{ $labels.namespace }}`}} Namespace.
+ expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusOperatorNodeLookupErrors
+ annotations:
+ message: Errors while reconciling Prometheus in {{`{{ $labels.namespace }}`}} Namespace.
+ expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
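Note on rendering: $operatorJob and $namespace are resolved once at template time from the chart's fullname helper and the release namespace, so the job selector in these expressions is fixed at install time. Purely as an illustration, for a hypothetical release whose fullname resolves to collection-prometheus-operator and which is installed in a hypothetical namespace onap, the first alert would render roughly as:

  - alert: PrometheusOperatorReconcileErrors
    expr: rate(prometheus_operator_reconcile_errors_total{job="collection-prometheus-operator-operator",namespace="onap"}[5m]) > 0.1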
diff --git a/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml
new file mode 100644
index 00000000..3c9e1490
--- /dev/null
+++ b/vnfs/DAaaS/collection/charts/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml
@@ -0,0 +1,105 @@
+# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+# Do not change in-place! In order to change this file, first read the following link:
+# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+{{- if and .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $namespace := .Release.Namespace }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus.rules
+ rules:
+ - alert: PrometheusConfigReloadFailed
+ annotations:
+ description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
+ summary: Reloading Prometheus' configuration failed
+ expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusNotificationQueueRunningFull
+ annotations:
+ description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
+ summary: Prometheus' alert notification queue is running full
+ expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
+ summary: Errors while sending alerts from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts
+ annotations:
+ description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
+ summary: Errors while sending alerts from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03
+ for: 10m
+ labels:
+ severity: critical
+ - alert: PrometheusNotConnectedToAlertmanagers
+ annotations:
+ description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
+ summary: Prometheus is not connected to any Alertmanagers
+ expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBReloadsFailing
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last two hours.'
+ summary: Prometheus has issues reloading data blocks from disk
+ expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBCompactionsFailing
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last two hours.'
+ summary: Prometheus has issues compacting sample blocks
+ expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
+ for: 12h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBWALCorruptions
+ annotations:
+ description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
+ summary: Prometheus write-ahead log is corrupted
+ expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusNotIngestingSamples
+ annotations:
+ description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
+ summary: Prometheus isn't ingesting samples
+ expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTargetScrapesDuplicate
+ annotations:
+ description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
+ summary: Prometheus has many samples rejected
+ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
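Note on the API group: each rule template above builds its apiVersion from .Values.prometheusOperator.crdApiGroup with a monitoring.coreos.com fallback, so by default the generated objects are monitoring.coreos.com/v1 PrometheusRule resources. A hedged sketch of the remaining toggles these templates check (key names taken from the guards above; the default values shown are assumptions):

  prometheusOperator:
    crdApiGroup: monitoring.coreos.com   # fallback used when unset
  defaultRules:
    create: true
    rules:
      prometheus: true           # guard for prometheus.rules.yaml
      prometheusOperator: true   # guard for prometheus-operator.yaml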