Diffstat (limited to 'kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14')
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml | 54
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml | 155
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml | 50
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml | 121
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml | 71
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml | 393
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml | 31
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml | 41
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml | 63
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml | 51
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml | 39
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml | 205
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml | 103
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml | 63
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml | 100
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml | 37
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml | 84
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml | 37
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml | 47
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml | 79
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml | 202
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml | 34
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml | 53
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml | 43
-rwxr-xr-x  kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml | 202
25 files changed, 2358 insertions, 0 deletions
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml
new file mode 100755
index 00000000..1c6db409
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml
@@ -0,0 +1,54 @@
+{{- /*
+Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ annotations:
+ message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` is out of sync.
+ expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: AlertmanagerFailedReload
+ annotations:
+ message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
+ expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: AlertmanagerMembersInconsistent
+ annotations:
+ message: Alertmanager has not found all other members of the cluster.
+ expr: |-
+ alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
+ != on (service) GROUP_LEFT()
+ count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
+ for: 5m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
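
For reference, a minimal sketch of what the AlertmanagerFailedReload rule above might render to, assuming a hypothetical release whose fullname resolves to "po-prometheus-operator" installed in a "monitoring" namespace (both names are illustrative). Note that the {{`{{`}} ... {{`}}`}} escapes emit literal {{ ... }}, so Prometheus, not Helm, expands the labels at alert time:

    - alert: AlertmanagerFailedReload
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: alertmanager_config_last_reload_successful{job="po-prometheus-operator-alertmanager",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
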
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml
new file mode 100755
index 00000000..97a9825d
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml
@@ -0,0 +1,155 @@
+{{- /*
+Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: etcd
+ rules:
+ - alert: etcdMembersDown
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
+ expr: |-
+ max by (job) (
+ sum by (job) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count by (job,endpoint) (
+ sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
+ )
+ )
+ > 0
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdInsufficientMembers
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
+ expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdNoLeader
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
+ expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ - alert: etcdHighNumberOfLeaderChanges
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
+ for: 5m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 5
+ for: 5m
+ labels:
+ severity: critical
+ - alert: etcdGRPCRequestsSlow
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdMemberCommunicationSlow
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedProposals
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighCommitDurations
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.05
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdHTTPRequestsSlow
+ annotations:
+ message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
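
Each of these templates is gated on chart values. A minimal values.yaml sketch that would enable this etcd rule group, using only the keys referenced in the conditional at the top of the file:

    defaultRules:
      create: true
      rules:
        etcd: true
    kubeEtcd:
      enabled: true
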
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml
new file mode 100755
index 00000000..4ccd9441
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml
@@ -0,0 +1,50 @@
+{{- /*
+Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: general.rules
+ rules:
+ - alert: TargetDown
+ annotations:
+ message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
+ expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ - alert: Watchdog
+ annotations:
+ message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
+
+ This alert is always firing, therefore it should always be firing in Alertmanager
+
+ and always fire against a receiver. There are integrations with various notification
+
+ mechanisms that send a notification when this alert is not firing. For example the
+
+ "DeadMansSnitch" integration in PagerDuty.
+
+ '
+ expr: vector(1)
+ labels:
+ severity: none
+{{- end }}
\ No newline at end of file
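
The always-firing Watchdog alert only adds value if something consumes it. An illustrative Alertmanager routing sketch (not part of this chart; the receiver name, interval, and URL are assumptions) that forwards it to an external heartbeat endpoint, which in turn alerts when the pings stop:

    route:
      routes:
      - match:
          alertname: Watchdog
        receiver: watchdog-heartbeat
        repeat_interval: 5m
    receivers:
    - name: watchdog-heartbeat
      webhook_configs:
      - url: https://heartbeat.example.invalid/ping
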
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml
new file mode 100755
index 00000000..4bc2cc7d
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml
@@ -0,0 +1,121 @@
+{{- /*
+Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: k8s.rules
+ rules:
+ - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
+ record: namespace:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ sum by (cluster, namespace, pod, container) (
+ rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
+ ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
+ 1, max by(cluster, namespace, pod, node) (kube_pod_info)
+ )
+ record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
+ - expr: |-
+ container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+ * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+ max by(namespace, pod, node) (kube_pod_info)
+ )
+ record: node_namespace_pod_container:container_memory_working_set_bytes
+ - expr: |-
+ container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+ * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+ max by(namespace, pod, node) (kube_pod_info)
+ )
+ record: node_namespace_pod_container:container_memory_rss
+ - expr: |-
+ container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+ * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+ max by(namespace, pod, node) (kube_pod_info)
+ )
+ record: node_namespace_pod_container:container_memory_cache
+ - expr: |-
+ container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
+ * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
+ max by(namespace, pod, node) (kube_pod_info)
+ )
+ record: node_namespace_pod_container:container_memory_swap
+ - expr: sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
+ record: namespace:container_memory_usage_bytes:sum
+ - expr: |-
+ sum by (namespace) (
+ sum by (namespace, pod) (
+ max by (namespace, pod, container) (
+ kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
+ ) * on(namespace, pod) group_left() max by (namespace, pod) (
+ kube_pod_status_phase{phase=~"Pending|Running"} == 1
+ )
+ )
+ )
+ record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
+ - expr: |-
+ sum by (namespace) (
+ sum by (namespace, pod) (
+ max by (namespace, pod, container) (
+ kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
+ ) * on(namespace, pod) group_left() max by (namespace, pod) (
+ kube_pod_status_phase{phase=~"Pending|Running"} == 1
+ )
+ )
+ )
+ record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
+ - expr: |-
+ max by (cluster, namespace, workload, pod) (
+ label_replace(
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+ "replicaset", "$1", "owner_name", "(.*)"
+ ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
+ 1, max by (replicaset, namespace, owner_name) (
+ kube_replicaset_owner{job="kube-state-metrics"}
+ )
+ ),
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ )
+ labels:
+ workload_type: deployment
+ record: mixin_pod_workload
+ - expr: |-
+ max by (cluster, namespace, workload, pod) (
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ )
+ labels:
+ workload_type: daemonset
+ record: mixin_pod_workload
+ - expr: |-
+ max by (cluster, namespace, workload, pod) (
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ )
+ labels:
+ workload_type: statefulset
+ record: mixin_pod_workload
+{{- end }}
\ No newline at end of file
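
An illustrative PromQL query combining two of the recording rules above to report CPU usage per workload rather than per container; swapping the workload_type value selects one of the three owner kinds recorded above:

    sum by (cluster, namespace, workload) (
        node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
      * on (cluster, namespace, pod) group_left(workload)
        mixin_pod_workload{workload_type="deployment"}
    )
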
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml
new file mode 100755
index 00000000..010d3446
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml
@@ -0,0 +1,71 @@
+{{- /*
+Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver-slos
+ rules:
+ - alert: KubeAPIErrorBudgetBurn
+ annotations:
+ message: The API server is burning too much error budget
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ expr: |-
+ sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
+ and
+ sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
+ for: 2m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorBudgetBurn
+ annotations:
+ message: The API server is burning too much error budget
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ expr: |-
+ sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
+ and
+ sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorBudgetBurn
+ annotations:
+ message: The API server is burning too much error budget
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ expr: |-
+ sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
+ and
+ sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeAPIErrorBudgetBurn
+ annotations:
+ message: The API server is burning too much error budget
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn
+ expr: |-
+ sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
+ and
+ sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
+ for: 3h
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
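
These four rules implement a multi-window, multi-burn-rate policy for a 99% availability SLO (0.01000 is the error budget). A worked example for the first, fastest-paging rule:

    error budget            = 1 - 0.99 = 1%
    budget burned in 1h     = 14.4 * 1h / 720h = 2% of the 30-day budget
    time to exhaust budget  = 30d / 14.4 ≈ 2.1 days

The paired short window (5m alongside 1h, 30m alongside 6h, and so on) confirms the burn is still in progress before paging, so a resolved incident stops alerting quickly.
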
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml
new file mode 100755
index 00000000..1b00134e
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml
@@ -0,0 +1,393 @@
+{{- /*
+Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver.rules
+ rules:
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate1d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate1h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate2h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate30m
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate3d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate5m
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
+ -
+ (
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
+ )
+ )
+ +
+ # errors
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
+ labels:
+ verb: read
+ record: apiserver_request:burnrate6h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate1d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate1h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate2h
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate30m
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate3d
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate5m
+ - expr: |-
+ (
+ (
+ # too slow
+ sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
+ -
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
+ )
+ +
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
+ )
+ /
+ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
+ labels:
+ verb: write
+ record: apiserver_request:burnrate6h
+ - expr: |-
+ 1 - (
+ (
+ # write too slow
+ sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
+ -
+ sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
+ ) +
+ (
+ # read too slow
+ sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
+ -
+ (
+ sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
+ sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
+ sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
+ )
+ ) +
+ # errors
+ sum(code:apiserver_request_total:increase30d{code=~"5.."})
+ )
+ /
+ sum(code:apiserver_request_total:increase30d)
+ labels:
+ verb: all
+ record: apiserver_request:availability30d
+ - expr: |-
+ 1 - (
+ sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
+ -
+ (
+ # too slow
+ sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) +
+ sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
+ sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
+ )
+ +
+ # errors
+ sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."})
+ )
+ /
+ sum(code:apiserver_request_total:increase30d{verb="read"})
+ labels:
+ verb: read
+ record: apiserver_request:availability30d
+ - expr: |-
+ 1 - (
+ (
+ # too slow
+ sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
+ -
+ sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
+ )
+ +
+ # errors
+ sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."})
+ )
+ /
+ sum(code:apiserver_request_total:increase30d{verb="write"})
+ labels:
+ verb: write
+ record: apiserver_request:availability30d
+ - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver"}[30d]))
+ record: code_verb:apiserver_request_total:increase30d
+ - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
+ labels:
+ verb: read
+ record: code:apiserver_request_total:increase30d
+ - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
+ labels:
+ verb: write
+ record: code:apiserver_request_total:increase30d
+ - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
+ labels:
+ verb: read
+ record: code_resource:apiserver_request_total:rate5m
+ - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
+ labels:
+ verb: write
+ record: code_resource:apiserver_request_total:rate5m
+ - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
+ labels:
+ quantile: '0.99'
+ verb: read
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
+ labels:
+ quantile: '0.99'
+ verb: write
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
+ - expr: |-
+ sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+ /
+ sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
+ record: cluster:apiserver_request_duration_seconds:mean5m
+ - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
+{{- end }}
\ No newline at end of file
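
Illustrative queries over the availability recording rules above; the 0.99 target mirrors the SLO used by the kube-apiserver-slos alerts and is an assumption here:

    # measured 30-day availability across all verbs
    apiserver_request:availability30d{verb="all"}
    # fraction of the 99% SLO's error budget still remaining
    (apiserver_request:availability30d{verb="all"} - 0.99) / (1 - 0.99)
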
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml
new file mode 100755
index 00000000..0b963276
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml
@@ -0,0 +1,31 @@
+{{- /*
+Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-general.rules
+ rules:
+ - expr: count without(instance, pod, node) (up == 1)
+ record: count:up1
+ - expr: count without(instance, pod, node) (up == 0)
+ record: count:up0
+{{- end }}
\ No newline at end of file
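
An illustrative use of the two counting rules above: the per-job fraction of healthy targets (the ratio only returns a value for label sets where both series exist):

    count:up1 / (count:up1 + count:up0)
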
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml
new file mode 100755
index 00000000..1ff4cb07
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml
@@ -0,0 +1,41 @@
+{{- /*
+Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-recording.rules
+ rules:
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+ record: instance:node_cpu:rate:sum
+ - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
+ record: instance:node_filesystem_usage:sum
+ - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+ record: instance:node_network_receive_bytes:rate:sum
+ - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+ record: instance:node_network_transmit_bytes:rate:sum
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
+ record: instance:node_cpu:ratio
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
+ record: cluster:node_cpu:sum_rate5m
+ - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+ record: cluster:node_cpu:ratio
+{{- end }}
\ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml
new file mode 100755
index 00000000..ec718cef
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml
@@ -0,0 +1,63 @@
+{{- /*
+Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-scheduler.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
+{{- end }}
\ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml
new file mode 100755
index 00000000..6f281bcb
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml
@@ -0,0 +1,51 @@
+{{- /*
+Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-state-metrics
+ rules:
+ - alert: KubeStateMetricsListErrors
+ annotations:
+ message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors
+ expr: |-
+ (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
+ /
+ sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
+ > 0.01
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStateMetricsWatchErrors
+ annotations:
+ message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors
+ expr: |-
+ (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
+ /
+ sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
+ > 0.01
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
\ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml
new file mode 100755
index 00000000..9d9fa950
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml
@@ -0,0 +1,39 @@
+{{- /*
+Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubelet.rules
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+ labels:
+ quantile: '0.99'
+ record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+ labels:
+ quantile: '0.9'
+ record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
+ labels:
+ quantile: '0.5'
+ record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
+{{- end }}
\ No newline at end of file
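
The on(instance) group_left(node) join above attaches the node name to the per-instance kubelet latency quantiles. An illustrative alert-style query over the recorded series, flagging nodes with high 99th-percentile PLEG (pod lifecycle event generator) relist latency; the 10s threshold is an assumption:

    node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
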
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml
new file mode 100755
index 00000000..3ae09119
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml
@@ -0,0 +1,205 @@
+{{- /*
+Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeStateMetrics.enabled .Values.defaultRules.rules.kubernetesApps }}
+{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-apps
+ rules:
+ - alert: KubePodCrashLooping
+ annotations:
+        message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times per 5 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
+ expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubePodNotReady
+ annotations:
+ message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
+ expr: sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDeploymentGenerationMismatch
+ annotations:
+        message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match; this indicates that the Deployment has failed but has not been rolled back.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
+ expr: |-
+ kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDeploymentReplicasMismatch
+ annotations:
+ message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
+ expr: |-
+ (
+ kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ ) and (
+ changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
+ ==
+ 0
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetReplicasMismatch
+ annotations:
+ message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
+ expr: |-
+ (
+ kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ ) and (
+ changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
+ ==
+ 0
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetGenerationMismatch
+ annotations:
+        message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
+ expr: |-
+ kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetUpdateNotRolledOut
+ annotations:
+ message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
+ expr: |-
+ max without (revision) (
+ kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ unless
+ kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ )
+ *
+ (
+ kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetRolloutStuck
+ annotations:
+ message: Only {{`{{`}} $value | humanizePercentage {{`}}`}} of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
+ expr: |-
+ kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ /
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} < 1.00
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeContainerWaiting
+ annotations:
+        message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container {{`}}`}} has been in a waiting state for longer than 1 hour.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting
+ expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeDaemonSetNotScheduled
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
+ expr: |-
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ -
+ kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeDaemonSetMisScheduled
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
+ expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeCronJobRunning
+ annotations:
+ message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning
+ expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobCompletion
+ annotations:
+ message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
+ expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobFailed
+ annotations:
+ message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
+ expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeHpaReplicasMismatch
+ annotations:
+ message: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
+ expr: |-
+ (kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
+ and
+ changes(kube_hpa_status_current_replicas[15m]) == 0
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeHpaMaxedOut
+ annotations:
+ message: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has been running at max replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
+ expr: |-
+ kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ ==
+ kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml
new file mode 100755
index 00000000..0247f5eb
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml
@@ -0,0 +1,103 @@
+{{- /*
+Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-resources
+ rules:
+ - alert: KubeCPUOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
+ expr: |-
+ sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
+ /
+ sum(kube_node_status_allocatable_cpu_cores)
+ >
+ (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemoryOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit
+ expr: |-
+ sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
+ /
+ sum(kube_node_status_allocatable_memory_bytes)
+ >
+ (count(kube_node_status_allocatable_memory_bytes)-1)
+ /
+ count(kube_node_status_allocatable_memory_bytes)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeCPUQuotaOvercommit
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Namespaces.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
+ /
+ sum(kube_node_status_allocatable_cpu_cores)
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemoryQuotaOvercommit
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Namespaces.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
+ /
+ sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"})
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeQuotaExceeded
+ annotations:
+ message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded
+ expr: |-
+ kube_resourcequota{job="kube-state-metrics", type="used"}
+ / ignoring(instance, job, type)
+ (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+ > 0.90
+ for: 15m
+ labels:
+ severity: warning
+ - alert: CPUThrottlingHigh
+ annotations:
+ message: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh
+ expr: |-
+ sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
+ /
+ sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
+ > ( 25 / 100 )
+ for: 15m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml
new file mode 100755
index 00000000..f2573966
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml
@@ -0,0 +1,63 @@
+{{- /*
+Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
+{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-storage
+ rules:
+ - alert: KubePersistentVolumeFillingUp
+ annotations:
+ message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
+ expr: |-
+ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ < 0.03
+ for: 1m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeFillingUp
+ annotations:
+ message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
+ expr: |-
+ (
+ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
+ ) < 0.15
+ and
+ predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubePersistentVolumeErrors
+ annotations:
+ message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
new file mode 100755
index 00000000..7583a599
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
@@ -0,0 +1,100 @@
+{{- /*
+Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system-apiserver
+ rules:
+ - alert: KubeAPILatencyHigh
+ annotations:
+ message: The API server has an abnormal latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
+ expr: |-
+ (
+ cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
+ >
+ on (verb) group_left()
+ (
+ avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ +
+ 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ )
+ ) > on (verb) group_left()
+ 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+ and on (verb,resource)
+ cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
+ >
+ 1
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeAPIErrorsHigh
+ annotations:
+ message: API server is returning errors for {{`{{`}} $value | humanizePercentage {{`}}`}} of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
+ /
+ sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+ labels:
+ severity: critical
+ - alert: AggregatedAPIErrors
+ annotations:
+        message: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors
+ expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
+ labels:
+ severity: warning
+ - alert: AggregatedAPIDown
+ annotations:
+        message: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} is down. It has not been available for at least the past five minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown
+ expr: sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
+ for: 5m
+ labels:
+ severity: warning
+{{- if .Values.kubeApiServer.enabled }}
+ - alert: KubeAPIDown
+ annotations:
+ message: KubeAPI has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown
+ expr: absent(up{job="apiserver"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml
new file mode 100755
index 00000000..6214d775
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml
@@ -0,0 +1,37 @@
+{{- /*
+Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system-controller-manager
+ rules:
+{{- if .Values.kubeControllerManager.enabled }}
+ - alert: KubeControllerManagerDown
+ annotations:
+ message: KubeControllerManager has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown
+ expr: absent(up{job="kube-controller-manager"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
new file mode 100755
index 00000000..bb7a2838
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
@@ -0,0 +1,84 @@
+{{- /*
+Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system-kubelet
+ rules:
+ - alert: KubeNodeNotReady
+ annotations:
+ message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
+ expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeNodeUnreachable
+ annotations:
+ message: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable
+ expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
+ labels:
+ severity: warning
+ - alert: KubeletTooManyPods
+ annotations:
+ message: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
+ expr: max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeNodeReadinessFlapping
+ annotations:
+ message: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping
+ expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeletPlegDurationHigh
+ annotations:
+ message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh
+ expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeletPodStartUpLatencyHigh
+ annotations:
+ message: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh
+ expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
+ for: 15m
+ labels:
+ severity: warning
+{{- if .Values.prometheusOperator.kubeletService.enabled }}
+ - alert: KubeletDown
+ annotations:
+ message: Kubelet has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown
+ expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }}
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml
new file mode 100755
index 00000000..84e3b1ab
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml
@@ -0,0 +1,37 @@
+{{- /*
+Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system-scheduler
+ rules:
+{{- if .Values.kubeScheduler.enabled }}
+ - alert: KubeSchedulerDown
+ annotations:
+ message: KubeScheduler has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown
+ expr: absent(up{job="kube-scheduler"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml
new file mode 100755
index 00000000..fc455f37
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml
@@ -0,0 +1,47 @@
+{{- /*
+Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system
+ rules:
+ - alert: KubeVersionMismatch
+ annotations:
+ message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
+ expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeClientErrors
+ annotations:
+        message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
+ expr: |-
+ (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
+ /
+ sum(rate(rest_client_requests_total[5m])) by (instance, job))
+ > 0.01
+ for: 15m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml
new file mode 100755
index 00000000..7adf85db
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml
@@ -0,0 +1,79 @@
+{{- /*
+Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node-exporter.rules
+ rules:
+ - expr: |-
+ count without (cpu) (
+ count without (mode) (
+ node_cpu_seconds_total{job="node-exporter"}
+ )
+ )
+ record: instance:node_num_cpu:sum
+ - expr: |-
+ 1 - avg without (cpu, mode) (
+ rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+ )
+ record: instance:node_cpu_utilisation:rate1m
+ - expr: |-
+ (
+ node_load1{job="node-exporter"}
+ /
+ instance:node_num_cpu:sum{job="node-exporter"}
+ )
+ record: instance:node_load1_per_cpu:ratio
+ - expr: |-
+ 1 - (
+ node_memory_MemAvailable_bytes{job="node-exporter"}
+ /
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ )
+ record: instance:node_memory_utilisation:ratio
+ - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
+ record: instance:node_vmstat_pgmajfault:rate1m
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+ record: instance_device:node_disk_io_time_seconds:rate1m
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
+ record: instance_device:node_disk_io_time_weighted_seconds:rate1m
+ - expr: |-
+ sum without (device) (
+ rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
+ )
+ record: instance:node_network_receive_bytes_excluding_lo:rate1m
+ - expr: |-
+ sum without (device) (
+ rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
+ )
+ record: instance:node_network_transmit_bytes_excluding_lo:rate1m
+ - expr: |-
+ sum without (device) (
+ rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
+ )
+ record: instance:node_network_receive_drop_excluding_lo:rate1m
+ - expr: |-
+ sum without (device) (
+ rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
+ )
+ record: instance:node_network_transmit_drop_excluding_lo:rate1m
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml
new file mode 100755
index 00000000..7b6c601b
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml
@@ -0,0 +1,202 @@
+{{- /*
+Generated from 'node-exporter' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node-exporter
+ rules:
+ - alert: NodeFilesystemSpaceFillingUp
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
+ summary: Filesystem is predicted to run out of space within the next 24 hours.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
+ and
+ predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemSpaceFillingUp
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup
+ summary: Filesystem is predicted to run out of space within the next 4 hours.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
+ and
+ predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeFilesystemAlmostOutOfSpace
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
+ summary: Filesystem has less than 5% space left.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemAlmostOutOfSpace
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace
+ summary: Filesystem has less than 3% space left.
+ expr: |-
+ (
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeFilesystemFilesFillingUp
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
+ summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
+ and
+ predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemFilesFillingUp
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup
+ summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
+ and
+ predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeFilesystemAlmostOutOfFiles
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
+ summary: Filesystem has less than 5% inodes left.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeFilesystemAlmostOutOfFiles
+ annotations:
+ description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles
+ summary: Filesystem has less than 3% inodes left.
+ expr: |-
+ (
+ node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
+ and
+ node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+ )
+ for: 1h
+ labels:
+ severity: critical
+ - alert: NodeNetworkReceiveErrs
+ annotations:
+ description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs
+ summary: Network interface is reporting many receive errors.
+ expr: increase(node_network_receive_errs_total[2m]) > 10
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeNetworkTransmitErrs
+ annotations:
+ description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs
+ summary: Network interface is reporting many transmit errors.
+ expr: increase(node_network_transmit_errs_total[2m]) > 10
+ for: 1h
+ labels:
+ severity: warning
+ - alert: NodeHighNumberConntrackEntriesUsed
+ annotations:
+ description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused
+        summary: Number of conntrack entries is getting close to the limit.
+ expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
+ labels:
+ severity: warning
+ - alert: NodeClockSkewDetected
+ annotations:
+        message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected
+ summary: Clock skew detected.
+ expr: |-
+ (
+ node_timex_offset_seconds > 0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) >= 0
+ )
+ or
+ (
+ node_timex_offset_seconds < -0.05
+ and
+ deriv(node_timex_offset_seconds[5m]) <= 0
+ )
+ for: 10m
+ labels:
+ severity: warning
+ - alert: NodeClockNotSynchronising
+ annotations:
+ message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising
+ summary: Clock not synchronising.
+ expr: min_over_time(node_timex_sync_status[5m]) == 0
+ for: 10m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml
new file mode 100755
index 00000000..b4b206d8
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml
@@ -0,0 +1,34 @@
+{{- /*
+Generated from 'node-network' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node-network
+ rules:
+ - alert: NodeNetworkInterfaceFlapping
+ annotations:
+        message: Network interface "{{`{{`}} $labels.device {{`}}`}}" is changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}.
+ expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
+ for: 2m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml
new file mode 100755
index 00000000..75ab27ed
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml
@@ -0,0 +1,53 @@
+{{- /*
+Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node.rules
+ rules:
+ - expr: sum(min(kube_pod_info) by (cluster, node))
+ record: ':kube_pod_info_node_count:'
+ - expr: |-
+ topk by(namespace, pod) (1,
+ max by (node, namespace, pod) (
+ label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
+ ))
+ record: 'node_namespace_pod:kube_pod_info:'
+ - expr: |-
+ count by (cluster, node) (sum by (node, cpu) (
+ node_cpu_seconds_total{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ ))
+ record: node:node_num_cpu:sum
+ - expr: |-
+ sum(
+ node_memory_MemAvailable_bytes{job="node-exporter"} or
+ (
+ node_memory_Buffers_bytes{job="node-exporter"} +
+ node_memory_Cached_bytes{job="node-exporter"} +
+ node_memory_MemFree_bytes{job="node-exporter"} +
+ node_memory_Slab_bytes{job="node-exporter"}
+ )
+ ) by (cluster)
+ record: :node_memory_MemAvailable_bytes:sum
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml
new file mode 100755
index 00000000..98f2d3bc
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml
@@ -0,0 +1,43 @@
+{{- /*
+Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! To change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus-operator
+ rules:
+ - alert: PrometheusOperatorReconcileErrors
+ annotations:
+ message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
+ expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusOperatorNodeLookupErrors
+ annotations:
+ message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
+ expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
+ for: 10m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
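
To make the job and namespace substitution concrete, here is roughly what the first alert renders to for a hypothetical release named kud-mon installed in the monitoring namespace, so that prometheus-operator.fullname resolves to kud-mon-prometheus-operator (both names are assumptions):

- alert: PrometheusOperatorReconcileErrors
  annotations:
    message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
  expr: rate(prometheus_operator_reconcile_errors_total{job="kud-mon-prometheus-operator-operator",namespace="monitoring"}[5m]) > 0.1
  for: 10m
  labels:
    severity: warning

Note how the backtick escapes survive rendering: Helm consumes the {{`{{`}} and {{`}}`}} literals and emits plain {{ and }}, leaving the $labels references intact for Prometheus to expand when the alert fires.
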
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml
new file mode 100755
index 00000000..71883f7e
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml
@@ -0,0 +1,202 @@
+{{- /*
+Generated from 'prometheus' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file, first read the following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus
+ rules:
+ - alert: PrometheusBadConfig
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
+ summary: Failed Prometheus configuration reload.
+ expr: |-
+ # Without max_over_time, failed scrapes could create false negatives, see
+ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+ max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
+ for: 10m
+ labels:
+ severity: critical
+ - alert: PrometheusNotificationQueueRunningFull
+ annotations:
+ description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
+ summary: Prometheus alert notification queue predicted to run full in less than 30m.
+ expr: |-
+ # Without min_over_time, failed scrapes could create false negatives, see
+ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+ (
+ predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30)
+ >
+ min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ for: 15m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
+ annotations:
+ description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
+ summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+ expr: |-
+ (
+ rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ /
+ rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ * 100
+ > 1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
+ annotations:
+ description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
+ summary: Prometheus has encountered more than 3% errors sending alerts to any Alertmanager.
+ expr: |-
+ min without(alertmanager) (
+ rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ /
+ rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ * 100
+ > 3
+ for: 15m
+ labels:
+ severity: critical
+ - alert: PrometheusNotConnectedToAlertmanagers
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
+ summary: Prometheus is not connected to any Alertmanagers.
+ expr: |-
+ # Without max_over_time, failed scrapes could create false negatives, see
+ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+ max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBReloadsFailing
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
+ summary: Prometheus has issues reloading blocks from disk.
+ expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBCompactionsFailing
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
+ summary: Prometheus has issues compacting blocks.
+ expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusNotIngestingSamples
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
+ summary: Prometheus is not ingesting samples.
+ expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusDuplicateTimestamps
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamps.
+ summary: Prometheus is dropping samples with duplicate timestamps.
+ expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusOutOfOrderTimestamps
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order.
+ summary: Prometheus is dropping samples with out-of-order timestamps.
+ expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusRemoteStorageFailures
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
+ summary: Prometheus fails to send samples to remote storage.
+ expr: |-
+ (
+ rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ /
+ (
+ rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ +
+ rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ )
+ * 100
+ > 1
+ for: 15m
+ labels:
+ severity: critical
+ - alert: PrometheusRemoteWriteBehind
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
+ summary: Prometheus remote write is behind.
+ expr: |-
+ # Without max_over_time, failed scrapes could create false negatives, see
+ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+ (
+ max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ - on(job, instance) group_right
+ max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ > 120
+ for: 15m
+ labels:
+ severity: critical
+ - alert: PrometheusRemoteWriteDesiredShards
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
+ summary: Prometheus remote write desired shards calculation wants to run more than the configured max shards.
+ expr: |-
+ # Without max_over_time, failed scrapes could create false negatives, see
+ # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+ (
+ max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ >
+ max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
+ )
+ for: 15m
+ labels:
+ severity: warning
+ - alert: PrometheusRuleFailures
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
+ summary: Prometheus is failing rule evaluations.
+ expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 15m
+ labels:
+ severity: critical
+ - alert: PrometheusMissingRuleEvaluations
+ annotations:
+ description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
+ summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
+ expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 15m
+ labels:
+ severity: warning
+{{- end }}
\ No newline at end of file
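
Assembled, the metadata template of this last file produces an object header along the following lines (same assumed kud-mon release and monitoring namespace; the app label value comes from the chart's name helper and is likewise an assumption):

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kud-mon-prometheus-operator-prometheus  # trunc 63 | trimSuffix "-" keeps this a valid resource name
  namespace: monitoring
  labels:
    app: prometheus-operator
spec:
  groups:
  - name: prometheus
    rules: []  # the fifteen alerts above, each scoped by job="kud-mon-prometheus-operator-prometheus"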