aboutsummaryrefslogtreecommitdiffstats
path: root/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules
diff options
context:
space:
mode:
Diffstat (limited to 'kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules')
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml54
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/etcd.yaml155
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/general.rules.yaml50
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml83
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml39
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml41
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml41
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml63
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml129
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml161
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml103
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml63
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml145
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-network.yaml48
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-time.yaml34
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node.rules.yaml202
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml43
-rwxr-xr-xkud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml109
18 files changed, 1563 insertions, 0 deletions
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml
new file mode 100755
index 00000000..54440239
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/alertmanager.rules.yaml
@@ -0,0 +1,54 @@
+{{- /*
+Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and both defaultRules.create and defaultRules.rules.alertmanager are set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-alertmanager.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: alertmanager.rules  # Selectors below are scoped to the chart's own alertmanager/operator job names and namespace.
+ rules:
+ - alert: AlertmanagerConfigInconsistent  # Per service: count of instances per config hash divided by the operator's replica count; any value other than 1 alerts.
+ annotations:  # The backtick-quoted brace pairs in messages make Helm emit literal double braces, so Prometheus (not Helm) expands $labels.
+ message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync.
+ expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ - alert: AlertmanagerFailedReload  # alertmanager_config_last_reload_successful has been 0 for 10m.
+ annotations:
+ message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
+ expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: AlertmanagerMembersInconsistent  # An instance's alertmanager_cluster_members differs from the number of instances counted for its service.
+ annotations:
+ message: Alertmanager has not found all other members of the cluster.
+ expr: |-
+ alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
+ != on (service) GROUP_LEFT()
+ count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
+ for: 5m
+ labels:
+ severity: critical
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/etcd.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/etcd.yaml
new file mode 100755
index 00000000..6abda2d3
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/etcd.yaml
@@ -0,0 +1,155 @@
+{{- /*
+Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and defaultRules.create, kubeEtcd.enabled and defaultRules.rules.etcd are all set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-etcd", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: etcd  # Almost every selector matches jobs by the regex ".*etcd.*".
+ rules:
+ - alert: etcdMembersDown  # Members with up == 0, or peers whose send-failure rate exceeds 0.01/s, for 3m.
+ annotations:  # The backtick-quoted brace pairs in messages make Helm emit literal double braces, so Prometheus (not Helm) expands $labels and $value.
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
+ expr: |-
+ max by (job) (
+ sum by (job) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count by (job,endpoint) (
+ sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
+ )
+ )
+ > 0
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdInsufficientMembers  # Fewer members up than (member count + 1) / 2.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
+ expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ for: 3m
+ labels:
+ severity: critical
+ - alert: etcdNoLeader  # etcd_server_has_leader has been 0 for 1m.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
+ expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ - alert: etcdHighNumberOfLeaderChanges  # Three or more leader changes in a 15m window; the absent() branch supplies 0 when the metric is missing.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
+ for: 5m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests  # Warning tier: more than 1% of gRPC responses are non-OK for 10m.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedGRPCRequests  # Critical tier of the same alert: more than 5% for 5m.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ /
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ > 5
+ for: 5m
+ labels:
+ severity: critical
+ - alert: etcdGRPCRequestsSlow  # 99th percentile of unary gRPC handling time above 0.15.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdMemberCommunicationSlow  # 99th percentile of peer round-trip time above 0.15.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedProposals  # 15m rate of failed proposals above 5, sustained for a further 15m.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
+ for: 15m
+ labels:
+ severity: warning
+ - alert: etcdHighFsyncDurations  # 99th percentile of WAL fsync duration above 0.5.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighCommitDurations  # 99th percentile of backend commit duration above 0.25.
+ annotations:
+ message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests  # Warning tier: failed/received HTTP ratio per method above 0.01, 404s excluded. NOTE(review): the threshold is a ratio although the message prints a percent sign, and the expr keeps only the method label - confirm.
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: etcdHighNumberOfFailedHTTPRequests  # Critical tier of the same alert: ratio above 0.05.
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
+ expr: |-
+ sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
+ BY (method) > 0.05
+ for: 10m
+ labels:
+ severity: critical
+ - alert: etcdHTTPRequestsSlow  # 99th percentile of successful HTTP request duration above 0.15. NOTE(review): this expr has no job selector, unlike the other rules in the group - confirm that is intended.
+ annotations:
+ message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+ > 0.15
+ for: 10m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/general.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/general.rules.yaml
new file mode 100755
index 00000000..d220cb38
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/general.rules.yaml
@@ -0,0 +1,50 @@
+{{- /*
+Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and both defaultRules.create and defaultRules.rules.general are set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-general.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: general.rules
+ rules:
+ - alert: TargetDown  # More than 10% of a job's targets report up == 0 for 10m.
+ annotations:  # The backtick-quoted brace pairs in the message make Helm emit literal double braces, so Prometheus (not Helm) expands $value and $labels.
+ message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.'
+ expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+ for: 10m
+ labels:
+ severity: warning
+ - alert: Watchdog  # Always-firing heartbeat: the expr is the constant vector(1) and there is no "for" delay.
+ annotations:
+ message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
+
+ This alert is always firing, therefore it should always be firing in Alertmanager
+
+ and always fire against a receiver. There are integrations with various notification
+
+ mechanisms that send a notification when this alert is not firing. For example the
+
+ "DeadMansSnitch" integration in PagerDuty.
+
+ '
+ expr: vector(1)
+ labels:
+ severity: none
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml
new file mode 100755
index 00000000..71c75fcc
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/k8s.rules.yaml
@@ -0,0 +1,83 @@
+{{- /*
+Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and both defaultRules.create and defaultRules.rules.k8s are set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-k8s.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: k8s.rules  # Recording rules only; this group defines no alerts.
+ rules:
+ - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
+ record: namespace:container_cpu_usage_seconds_total:sum_rate  # 5m CPU usage rate summed per namespace, from kubelet series with non-empty image and container_name.
+ - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
+ record: namespace:container_memory_usage_bytes:sum  # Container memory usage summed per namespace, same filters.
+ - expr: |-
+ sum by (namespace, pod_name, container_name) (
+ rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
+ )
+ record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate  # Same CPU rate, kept per namespace/pod_name/container_name.
+ - expr: |-
+ sum by(namespace) (
+ kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
+ * on (endpoint, instance, job, namespace, pod, service)
+ group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
+ )
+ record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum  # Memory requests per namespace, counting only pods in the Pending or Running phase.
+ - expr: |-
+ sum by (namespace) (
+ kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
+ * on (endpoint, instance, job, namespace, pod, service)
+ group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
+ )
+ record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum  # CPU requests per namespace, counting only pods in the Pending or Running phase.
+ - expr: |-
+ sum(
+ label_replace(
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+ "replicaset", "$1", "owner_name", "(.*)"
+ ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ ) by (namespace, workload, pod)
+ labels:
+ workload_type: deployment
+ record: mixin_pod_workload  # Pod-to-workload mapping: pods owned by a ReplicaSet get the ReplicaSet's owner_name as "workload".
+ - expr: |-
+ sum(
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ ) by (namespace, workload, pod)
+ labels:
+ workload_type: daemonset
+ record: mixin_pod_workload  # Same series name; pods owned directly by a DaemonSet.
+ - expr: |-
+ sum(
+ label_replace(
+ kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+ "workload", "$1", "owner_name", "(.*)"
+ )
+ ) by (namespace, workload, pod)
+ labels:
+ workload_type: statefulset
+ record: mixin_pod_workload  # Same series name; pods owned directly by a StatefulSet.
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml
new file mode 100755
index 00000000..5e565317
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-apiserver.rules.yaml
@@ -0,0 +1,39 @@
+{{- /*
+Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and defaultRules.create, kubeApiServer.enabled and defaultRules.rules.kubeApiserver are all set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-kube-apiserver.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-apiserver.rules  # One recorded series name at three quantiles, told apart by the "quantile" label. The division by 1e+06 is presumably a microseconds-to-seconds conversion - confirm.
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'  # Quoted so the label value stays a string.
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:apiserver_request_latencies:histogram_quantile
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml
new file mode 100755
index 00000000..09a7c754
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-alerting.rules.yaml
@@ -0,0 +1,41 @@
+{{- /*
+Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeAlerting }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and both defaultRules.create and defaultRules.rules.kubePrometheusNodeAlerting are set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-kube-prometheus-node-alerting.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-alerting.rules  # Both alerts read the series node:node_filesystem_usage: and node:node_filesystem_avail:, which are not defined in this file.
+ rules:
+ - alert: NodeDiskRunningFull  # Warning tier: usage above 0.85 and a linear fit over the last 6h predicts available space below zero in 24h (3600 * 24 seconds).
+ annotations:  # The backtick-quoted brace pairs in messages make Helm emit literal double braces, so Prometheus (not Helm) expands $labels.
+ message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 24 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)'
+ for: 30m
+ labels:
+ severity: warning
+ - alert: NodeDiskRunningFull  # Critical tier: same usage threshold, 30m fit window, predicted to run out within 2h (3600 * 2 seconds).
+ annotations:
+ message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 2 hours.
+ expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)'
+ for: 10m
+ labels:
+ severity: critical
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml
new file mode 100755
index 00000000..fc0f4830
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-prometheus-node-recording.rules.yaml
@@ -0,0 +1,41 @@
+{{- /*
+Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and both defaultRules.create and defaultRules.rules.kubePrometheusNodeRecording are set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-kube-prometheus-node-recording.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-prometheus-node-recording.rules  # Recording rules only; this group defines no alerts.
+ rules:
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+ record: instance:node_cpu:rate:sum  # 3m rate of non-idle, non-iowait CPU seconds per instance.
+ - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
+ record: instance:node_filesystem_usage:sum  # Size minus free bytes of the "/" mountpoint per instance.
+ - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+ record: instance:node_network_receive_bytes:rate:sum  # 3m receive byte rate per instance.
+ - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+ record: instance:node_network_transmit_bytes:rate:sum  # 3m transmit byte rate per instance.
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
+ record: instance:node_cpu:ratio  # 5m busy-CPU rate divided by the number of CPUs seen on the instance.
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
+ record: cluster:node_cpu:sum_rate5m  # Cluster-wide 5m busy-CPU rate; used as the numerator of the next rule.
+ - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
+ record: cluster:node_cpu:ratio  # Cluster busy-CPU rate divided by the total CPU count. The numerator is the series recorded just above; the name used before, cluster:node_cpu_seconds_total:rate5m, is not recorded anywhere in this group.
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml
new file mode 100755
index 00000000..3861fa63
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kube-scheduler.rules.yaml
@@ -0,0 +1,63 @@
+{{- /*
+Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version is >=1.10 and <1.14 and defaultRules.create, kubeScheduler.enabled and defaultRules.rules.kubeScheduler are all set.
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-kube-scheduler.rules", cut to 63 characters with a trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # Chart labels first, then the optional defaultRules.labels map.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kube-scheduler.rules  # Three scheduler latency histograms (e2e scheduling, scheduling algorithm, binding) recorded at quantiles 0.99, 0.9 and 0.5. Source metrics are named *_microseconds and each result is divided by 1e+06.
+ rules:
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'  # Quoted so the label value stays a string; the first three rules are the 0.99 set.
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.99'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'  # The next three rules are the 0.9 set.
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.9'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'  # The last three rules are the 0.5 (median) set.
+ record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
+ - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
+ labels:
+ quantile: '0.5'
+ record: cluster_quantile:scheduler_binding_latency:histogram_quantile
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml
new file mode 100755
index 00000000..7391f16b
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-absent.yaml
@@ -0,0 +1,129 @@
+{{- /*
+Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesAbsent }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-absent
+ rules:
+{{- if .Values.alertmanager.enabled }}
+ - alert: AlertmanagerDown  # rendered only if alertmanager.enabled; fires after 15m with no up == 1 series for job $alertmanagerJob in $namespace
+ annotations:
+ message: Alertmanager has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerdown
+ expr: absent(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeDns.enabled }}
+ - alert: CoreDNSDown  # fires after 15m with no up == 1 series for job kube-dns; NOTE(review): named CoreDNS but gated on kubeDns.enabled and selecting job kube-dns - confirm intended
+ annotations:
+ message: CoreDNS has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-corednsdown
+ expr: absent(up{job="kube-dns"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeApiServer.enabled }}
+ - alert: KubeAPIDown  # rendered only if kubeApiServer.enabled; fires after 15m with no up == 1 series for job apiserver
+ annotations:
+ message: KubeAPI has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown
+ expr: absent(up{job="apiserver"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeControllerManager.enabled }}
+ - alert: KubeControllerManagerDown  # rendered only if kubeControllerManager.enabled; fires after 15m with no up == 1 series for job kube-controller-manager
+ annotations:
+ message: KubeControllerManager has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown
+ expr: absent(up{job="kube-controller-manager"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeScheduler.enabled }}
+ - alert: KubeSchedulerDown  # rendered only if kubeScheduler.enabled; fires after 15m with no up == 1 series for job kube-scheduler
+ annotations:
+ message: KubeScheduler has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown
+ expr: absent(up{job="kube-scheduler"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.kubeStateMetrics.enabled }}
+ - alert: KubeStateMetricsDown  # rendered only if kubeStateMetrics.enabled; fires after 15m with no up == 1 series for job kube-state-metrics
+ annotations:
+ message: KubeStateMetrics has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsdown
+ expr: absent(up{job="kube-state-metrics"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.prometheusOperator.kubeletService.enabled }}
+ - alert: KubeletDown  # rendered only if prometheusOperator.kubeletService.enabled; fires after 15m with no up == 1 series for job kubelet
+ annotations:
+ message: Kubelet has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown
+ expr: absent(up{job="kubelet"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- if .Values.nodeExporter.enabled }}
+ - alert: NodeExporterDown  # rendered only if nodeExporter.enabled; fires after 15m with no up == 1 series for job node-exporter
+ annotations:
+ message: NodeExporter has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeexporterdown
+ expr: absent(up{job="node-exporter"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+ - alert: PrometheusDown  # always rendered with this group; fires after 15m with no up == 1 series for job $prometheusJob in $namespace
+ annotations:
+ message: Prometheus has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusdown
+ expr: absent(up{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- if .Values.prometheusOperator.enabled }}
+ - alert: PrometheusOperatorDown  # rendered only if prometheusOperator.enabled; fires after 15m with no up == 1 series for job $operatorJob in $namespace
+ annotations:
+ message: PrometheusOperator has disappeared from Prometheus target discovery.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatordown
+ expr: absent(up{job="{{ $operatorJob }}",namespace="{{ $namespace }}"} == 1)
+ for: 15m
+ labels:
+ severity: critical
+{{- end }}
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml
new file mode 100755
index 00000000..fa82f081
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-apps.yaml
@@ -0,0 +1,161 @@
+{{- /*
+Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeStateMetrics.enabled .Values.defaultRules.rules.kubernetesApps }}
+{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-apps
+ rules:
+ - alert: KubePodCrashLooping  # 15m restart rate scaled by 60 * 5 to restarts per 5 minutes; any non-zero value held for 1h fires
+ annotations:
+ message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 5 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
+ expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubePodNotReady  # per namespace/pod, fires when the pod is in phase Pending or Unknown for 1h
+ annotations:
+ message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than an hour.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
+ expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) > 0
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeDeploymentGenerationMismatch  # observed generation differs from metadata generation for 15m
+ annotations:
+ message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
+ expr: |-
+ kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDeploymentReplicasMismatch  # spec replicas differ from available replicas for 1h
+ annotations:
+ message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than an hour.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
+ expr: |-
+ kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 1h
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetReplicasMismatch  # ready replicas differ from status replicas for 15m
+ annotations:
+ message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
+ expr: |-
+ kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetGenerationMismatch  # observed generation differs from metadata generation for 15m
+ annotations:
+ message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
+ expr: |-
+ kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeStatefulSetUpdateNotRolledOut  # current_revision series with no matching update_revision series, multiplied by series where replicas != replicas_updated; held for 15m
+ annotations:
+ message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
+ expr: |-
+ max without (revision) (
+ kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ unless
+ kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ )
+ *
+ (
+ kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ !=
+ kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ )
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetRolloutStuck  # number_ready / desired_number_scheduled * 100 stays below 100 for 15m
+ annotations:
+ message: Only {{`{{`}} $value {{`}}`}}% of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
+ expr: |-
+ kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ /
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} * 100 < 100
+ for: 15m
+ labels:
+ severity: critical
+ - alert: KubeDaemonSetNotScheduled  # desired_number_scheduled minus current_number_scheduled stays above 0 for 10m
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
+ expr: |-
+ kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
+ -
+ kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeDaemonSetMisScheduled  # number_misscheduled stays above 0 for 10m
+ annotations:
+ message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
+ expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeCronJobRunning  # current time is more than 3600s past kube_cronjob_next_schedule_time, held for 1h
+ annotations:
+ message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning
+ expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobCompletion  # spec completions minus succeeded count stays above 0 for 1h
+ annotations:
+ message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
+ expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeJobFailed  # kube_job_status_failed stays above 0 for 1h
+ annotations:
+ message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
+ expr: kube_job_status_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
+ for: 1h
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml
new file mode 100755
index 00000000..ee51ebd0
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-resources.yaml
@@ -0,0 +1,103 @@
+{{- /*
+Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-resources
+ rules:
+ - alert: KubeCPUOvercommit  # requested CPU / total node CPU exceeds (N-1)/N, N being the count of node:node_num_cpu:sum series; held for 5m
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
+ /
+ sum(node:node_num_cpu:sum)
+ >
+ (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit  # requested memory / node-exporter MemTotal exceeds (N-1)/N, N being the count of node:node_num_cpu:sum series; held for 5m
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit
+ expr: |-
+ sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ >
+ (count(node:node_num_cpu:sum)-1)
+ /
+ count(node:node_num_cpu:sum)
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeCPUOvercommit  # second rule with this name: sum of hard cpu resource quotas / total node CPU above 1.5 for 5m
+ annotations:
+ message: Cluster has overcommitted CPU resource requests for Namespaces.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
+ /
+ sum(node:node_num_cpu:sum)
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeMemOvercommit  # second rule with this name: sum of hard memory resource quotas / node-exporter MemTotal above 1.5 for 5m
+ annotations:
+ message: Cluster has overcommitted memory resource requests for Namespaces.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit
+ expr: |-
+ sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ > 1.5
+ for: 5m
+ labels:
+ severity: warning
+ - alert: KubeQuotaExceeded  # used quota as a percentage of non-zero hard quota (matched ignoring instance, job, type) above 90 for 15m
+ annotations:
+ message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} printf "%0.0f" $value {{`}}`}}% of its {{`{{`}} $labels.resource {{`}}`}} quota.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded
+ expr: |-
+ 100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+ / ignoring(instance, job, type)
+ (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+ > 90
+ for: 15m
+ labels:
+ severity: warning
+ - alert: CPUThrottlingHigh  # throttled CFS periods as a percentage of all CFS periods over 5m, per container_name/pod_name/namespace, above 25 for 15m
+ annotations:
+ message: '{{`{{`}} printf "%0.0f" $value {{`}}`}}% throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container_name {{`}}`}} in pod {{`{{`}} $labels.pod_name {{`}}`}}.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh
+ expr: |-
+ 100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
+ /
+ sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
+ > 25
+ for: 15m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml
new file mode 100755
index 00000000..715924b8
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-storage.yaml
@@ -0,0 +1,63 @@
+{{- /*
+Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
+{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-storage
+ rules:
+ - alert: KubePersistentVolumeUsageCritical  # available bytes as a percentage of capacity below 3 for 1m
+ annotations:
+ message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} printf "%0.2f" $value {{`}}`}}% free.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeusagecritical
+ expr: |-
+ 100 * kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
+ < 3
+ for: 1m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeFullInFourDays  # below 15 percent available AND a linear fit of the last 6h predicts negative available bytes 4 * 24 * 3600 seconds ahead; held for 5m
+ annotations:
+ message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} printf "%0.2f" $value {{`}}`}}% is available.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefullinfourdays
+ expr: |-
+ 100 * (
+ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
+ /
+ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
+ ) < 15
+ and
+ predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}[6h], 4 * 24 * 3600) < 0
+ for: 5m
+ labels:
+ severity: critical
+ - alert: KubePersistentVolumeErrors  # a persistent volume reports phase Failed or Pending for 5m; not filtered by $targetNamespace
+ annotations:
+ message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml
new file mode 100755
index 00000000..36a11931
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml
@@ -0,0 +1,145 @@
+{{- /*
+Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: kubernetes-system
+ rules:
+ - alert: KubeNodeNotReady  # the Ready condition series with status true equals 0 for 1h
+ annotations:
+ message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.'
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
+ expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeVersionMismatch  # counts distinct gitVersion values after label_replace trims each to its leading v-number prefix, excluding kube-dns and coredns jobs; NOTE(review): the regex dots are unescaped and match any character
+ annotations:
+ message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
+ expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
+ for: 1h
+ labels:
+ severity: warning
+ - alert: KubeClientErrors  # share of rest_client_requests_total with a 5xx code over 5m, per instance/job, above 1 percent for 15m
+ annotations:
+ message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
+ expr: |-
+ (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
+ /
+ sum(rate(rest_client_requests_total[5m])) by (instance, job))
+ * 100 > 1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeClientErrors  # second rule with this name: 5m rate of ksm_scrape_error_total per instance/job above 0.1 per second for 15m
+ annotations:
+ message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.2f" $value {{`}}`}} errors / second.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
+ expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeletTooManyPods  # running pod count above 110 * 0.9 (that is, above 99) for 15m; the limit of 110 is hard-coded in both expr and message
+ annotations:
+ message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
+ expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+ for: 15m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh  # warning tier: recorded 0.99 quantile above 1 for 10m, excluding subresource log and LIST/WATCH/WATCHLIST/PROXY/CONNECT verbs
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeAPILatencyHigh  # critical tier: same selector as the warning rule of this name, threshold above 4 for 10m
+ annotations:
+ message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
+ expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh  # critical tier, cluster-wide: 5xx share of apiserver_request_count over 5m above 3 percent for 10m
+ annotations:
+ message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh  # warning tier, cluster-wide: 5xx share of apiserver_request_count over 5m above 1 percent for 10m
+ annotations:
+ message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeAPIErrorsHigh  # critical tier, per resource/subresource/verb: 5xx share of apiserver_request_count over 5m above 10 percent for 10m
+ annotations:
+ message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
+ for: 10m
+ labels:
+ severity: critical
+ - alert: KubeAPIErrorsHigh  # warning tier, per resource/subresource/verb: 5xx share of apiserver_request_count over 5m above 5 percent for 10m
+ annotations:
+ message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
+ expr: |-
+ sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
+ /
+ sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
+ for: 10m
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration  # warning tier: 0.01 quantile of the client certificate expiration histogram below 604800 seconds (7 days); no "for" hold is set
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+ labels:
+ severity: warning
+ - alert: KubeClientCertificateExpiration  # critical tier: 0.01 quantile of the client certificate expiration histogram below 86400 seconds (24 hours); no "for" hold is set
+ annotations:
+ message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
+ runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+ labels:
+ severity: critical
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-network.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-network.yaml
new file mode 100755
index 00000000..1de2a621
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-network.yaml
@@ -0,0 +1,48 @@
+{{- /*
+Generated from 'node-network' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node-network
+ rules:
+ - alert: NetworkReceiveErrors  # non-zero 2m rate of receive errors on any non-veth device reported by node-exporter, held for 2m
+ annotations:
+ message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing receive errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
+ expr: rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+ for: 2m
+ labels:
+ severity: warning
+ - alert: NetworkTransmitErrors  # non-zero 2m rate of transmit errors on any non-veth device reported by node-exporter, held for 2m
+ annotations:
+ message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing transmit errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
+ expr: rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+ for: 2m
+ labels:
+ severity: warning
+ - alert: NodeNetworkInterfaceFlapping  # node_network_up changed value more than 2 times within 2m on a non-veth device, held for 2m
+ annotations:
+ message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}
+ expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
+ for: 2m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-time.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-time.yaml
new file mode 100755
index 00000000..b53a6af2
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node-time.yaml
@@ -0,0 +1,34 @@
+{{- /*
+Generated from 'node-time' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.time }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version (kubeTargetVersionOverride if set, else the cluster GitVersion) is >=1.10.0-0 and <1.14.0-0 and both defaultRules.create and defaultRules.rules.time are truthy.
+kind: PrometheusRule  # Packages the 'node-time' alert group as a single PrometheusRule resource.
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-time" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-node-time", cut to 63 characters with any trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # The app label, then the chart-wide labels helper, then optional defaultRules.labels.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:  # Emitted only when defaultRules.annotations is set.
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node-time
+ rules:
+ - alert: ClockSkewDetected  # Raised when the absolute clock offset reported by node-exporter exceeds the threshold in expr.
+ annotations:  # The backtick-quoted brace pairs below make Helm print literal double braces, leaving the $labels lookups for Prometheus to expand.
+ message: Clock skew detected on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}. Ensure NTP is configured correctly on this host.
+ expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03  # Threshold 0.03 - presumably seconds, going by the metric name; abs() covers offsets in either direction.
+ for: 2m  # The expression has to hold for 2 minutes before the alert fires.
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node.rules.yaml
new file mode 100755
index 00000000..bd2c50fe
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/node.rules.yaml
@@ -0,0 +1,202 @@
+{{- /*
+Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version (kubeTargetVersionOverride if set, else the cluster GitVersion) is >=1.10.0-0 and <1.14.0-0 and defaultRules.create, nodeExporter.enabled and defaultRules.rules.node are all truthy.
+kind: PrometheusRule  # Holds the 'node.rules' group; every rule below is a recording rule (expr + record), none is an alert.
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-node.rules", cut to 63 characters with any trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # The app label, then the chart-wide labels helper, then optional defaultRules.labels.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:  # Emitted only when defaultRules.annotations is set.
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: node.rules
+ rules:
+ - expr: sum(min(kube_pod_info) by (node))
+ record: ':kube_pod_info_node_count:'  # Minimum of kube_pod_info per node, summed over all nodes.
+ - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
+ record: 'node_namespace_pod:kube_pod_info:'  # kube_pod_info reduced to the node/namespace/pod labels; the per-node rules below join against it on (namespace, pod) to pick up the node label.
+ - expr: |-
+ count by (node) (sum by (node, cpu) (
+ node_cpu_seconds_total{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ ))
+ record: node:node_num_cpu:sum  # Number of distinct cpu label values per node.
+ - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
+ record: :node_cpu_utilisation:avg1m  # 1 minus the average 1m idle-mode CPU rate over all node-exporter series.
+ - expr: |-
+ 1 - avg by (node) (
+ rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:)
+ record: node:node_cpu_utilisation:avg1m  # Same calculation as :node_cpu_utilisation:avg1m, averaged per node.
+ - expr: |-
+ node:node_cpu_utilisation:avg1m
+ *
+ node:node_num_cpu:sum
+ /
+ scalar(sum(node:node_num_cpu:sum))
+ record: node:cluster_cpu_utilisation:ratio  # Per-node CPU utilisation scaled by that node's share of the summed CPU count.
+ - expr: |-
+ sum(node_load1{job="node-exporter"})
+ /
+ sum(node:node_num_cpu:sum)
+ record: ':node_cpu_saturation_load1:'  # Summed node_load1 divided by the summed CPU count.
+ - expr: |-
+ sum by (node) (
+ node_load1{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ node:node_num_cpu:sum
+ record: 'node:node_cpu_saturation_load1:'  # node_load1 per node divided by that node's CPU count.
+ - expr: |-
+ 1 -
+ sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ /
+ sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: ':node_memory_utilisation:'  # 1 minus (MemFree + Cached + Buffers) / MemTotal, each side summed over all node-exporter series.
+ - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ record: :node_memory_MemFreeCachedBuffers_bytes:sum  # MemFree + Cached + Buffers summed over all node-exporter series.
+ - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
+ record: :node_memory_MemTotal_bytes:sum  # MemTotal summed over all node-exporter series.
+ - expr: |-
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_available:sum  # MemFree + Cached + Buffers summed per node.
+ - expr: |-
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_bytes_total:sum  # MemTotal summed per node.
+ - expr: |-
+ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+ /
+ node:node_memory_bytes_total:sum
+ record: node:node_memory_utilisation:ratio  # (total - available) / total, per node.
+ - expr: |-
+ (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
+ /
+ scalar(sum(node:node_memory_bytes_total:sum))
+ record: node:cluster_memory_utilisation:ratio  # Per-node (total - available) divided by the total summed over all nodes.
+ - expr: |-
+ 1e3 * sum(
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ )
+ record: :node_memory_swap_io_bytes:sum_rate  # 1e3 times the summed 1m rates of pgpgin and pgpgout. NOTE(review): the record name says swap I/O but the expr reads the vmstat pgpgin/pgpgout counters - confirm this is intended.
+ - expr: |-
+ 1 -
+ sum by (node) (
+ (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ /
+ sum by (node) (
+ node_memory_MemTotal_bytes{job="node-exporter"}
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: 'node:node_memory_utilisation:'  # Per-node 1 - available/total, computed straight from the raw node-exporter metrics.
+ - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
+ record: 'node:node_memory_utilisation_2:'  # The same ratio, derived from the two recorded per-node series instead.
+ - expr: |-
+ 1e3 * sum by (node) (
+ (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_memory_swap_io_bytes:sum_rate  # Per-node variant of :node_memory_swap_io_bytes:sum_rate.
+ - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
+ record: :node_disk_utilisation:avg_irate  # Average 1m irate of disk I/O time over devices matching the device regex.
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_utilisation:avg_irate  # Same as :node_disk_utilisation:avg_irate, averaged per node.
+ - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
+ record: :node_disk_saturation:avg_irate  # Average 1m irate of weighted disk I/O time over devices matching the device regex.
+ - expr: |-
+ avg by (node) (
+ irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_disk_saturation:avg_irate  # Same as :node_disk_saturation:avg_irate, averaged per node.
+ - expr: |-
+ max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
+ - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_usage:'  # (size - avail) / size, max per instance/namespace/pod/device, for the listed filesystem types only.
+ - expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
+ record: 'node:node_filesystem_avail:'  # avail / size, max per instance/namespace/pod/device, for the listed filesystem types only.
+ - expr: |-
+ sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
+ sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
+ record: :node_net_utilisation:sum_irate  # Summed receive plus transmit byte irate, excluding devices whose name matches veth.+.
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
+ irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_utilisation:sum_irate  # Same as :node_net_utilisation:sum_irate, summed per node.
+ - expr: |-
+ sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
+ sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
+ record: :node_net_saturation:sum_irate  # Summed receive plus transmit drop irate, excluding devices whose name matches veth.+.
+ - expr: |-
+ sum by (node) (
+ (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
+ irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
+ * on (namespace, pod) group_left(node)
+ node_namespace_pod:kube_pod_info:
+ )
+ record: node:node_net_saturation:sum_irate  # Same as :node_net_saturation:sum_irate, summed per node.
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_total:'  # node_filesystem_files for mountpoint "/" per node; joined to kube_pod_info on host_ip, which label_replace takes from the instance label up to its last colon.
+ - expr: |-
+ max(
+ max(
+ kube_pod_info{job="kube-state-metrics", host_ip!=""}
+ ) by (node, host_ip)
+ * on (host_ip) group_right (node)
+ label_replace(
+ (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
+ )
+ ) by (node)
+ record: 'node:node_inodes_free:'  # Same join as node:node_inodes_total:, reading node_filesystem_files_free instead.
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml
new file mode 100755
index 00000000..9975be36
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus-operator.yaml
@@ -0,0 +1,43 @@
+{{- /*
+Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
+{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version (kubeTargetVersionOverride if set, else the cluster GitVersion) is >=1.10.0-0 and <1.14.0-0 and both defaultRules.create and defaultRules.rules.prometheusOperator are truthy.
+kind: PrometheusRule  # Packages the 'prometheus-operator' alert group, which watches two operator error counters.
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-prometheus-operator", cut to 63 characters with any trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # The app label, then the chart-wide labels helper, then optional defaultRules.labels.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:  # Emitted only when defaultRules.annotations is set.
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus-operator
+ rules:
+ - alert: PrometheusOperatorReconcileErrors  # Raised when the 5m rate of reconcile errors exceeds 0.1.
+ annotations:  # The backtick-quoted brace pairs below make Helm print literal double braces, leaving the $labels lookups for Prometheus to expand.
+ message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
+ expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1  # job is the chart fullname plus "-operator"; namespace is the output of the prometheus-operator.namespace helper.
+ for: 10m  # The expression has to hold for 10 minutes before the alert fires.
+ labels:
+ severity: warning
+ - alert: PrometheusOperatorNodeLookupErrors  # Raised when the 5m rate of node address lookup errors exceeds 0.1.
+ annotations:  # NOTE(review): the message below reuses the reconcile wording although expr watches node address lookup errors - confirm this is intended.
+ message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
+ expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1  # Same job and namespace selectors as the reconcile alert.
+ for: 10m  # The expression has to hold for 10 minutes before the alert fires.
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml
new file mode 100755
index 00000000..9cd2eea0
--- /dev/null
+++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/prometheus.rules.yaml
@@ -0,0 +1,109 @@
+{{- /*
+Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }}
+apiVersion: monitoring.coreos.com/v1  # Rendered only when the target Kubernetes version (kubeTargetVersionOverride if set, else the cluster GitVersion) is >=1.10.0-0 and <1.14.0-0 and both defaultRules.create and defaultRules.rules.prometheus are truthy.
+kind: PrometheusRule  # Packages the 'prometheus.rules' alert group; every expr selects job = chart fullname plus "-prometheus" and namespace = output of the prometheus-operator.namespace helper.
+metadata:
+ name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}  # Chart fullname plus "-prometheus.rules", cut to 63 characters with any trailing "-" removed.
+ namespace: {{ template "prometheus-operator.namespace" . }}
+ labels:  # The app label, then the chart-wide labels helper, then optional defaultRules.labels.
+ app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+ annotations:  # Emitted only when defaultRules.annotations is set.
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+ groups:
+ - name: prometheus.rules
+ rules:
+ - alert: PrometheusConfigReloadFailed  # Raised when the last-reload-successful gauge equals 0.
+ annotations:  # In all annotations of this group the backtick-quoted brace pairs make Helm print literal double braces, leaving the $labels and $value lookups for Prometheus to expand.
+ description: Reloading Prometheus' configuration has failed for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}}
+ summary: Reloading Prometheus' configuration failed
+ expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusNotificationQueueRunningFull  # Raised when a linear prediction over the last 5m puts the queue length above its capacity within 60 * 30 seconds.
+ annotations:
+ description: Prometheus' alert notification queue is running full for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}
+ summary: Prometheus' alert notification queue is running full
+ expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts  # Warning tier: notification error ratio above 0.01.
+ annotations:  # The description reads the lower-case alertmanager label; label names are case-sensitive, so a capitalised lookup would render empty.
+ description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}
+ summary: Errors while sending alert from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusErrorSendingAlerts  # Critical tier of the alert above: same ratio, threshold 0.03.
+ annotations:  # The description reads the lower-case alertmanager label; label names are case-sensitive, so a capitalised lookup would render empty.
+ description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}
+ summary: Errors while sending alerts from Prometheus
+ expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03
+ for: 10m
+ labels:
+ severity: critical
+ - alert: PrometheusNotConnectedToAlertmanagers  # Raised when fewer than one Alertmanager has been discovered.
+ annotations:
+ description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} is not connected to any Alertmanagers
+ summary: Prometheus is not connected to any Alertmanagers
+ expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBReloadsFailing  # Raised when the reload failure counter increased within the 2h window of expr.
+ annotations:  # The description states two hours so that it matches the [2h] range the reported $value is computed over.
+ description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} reload failures over the last two hours.'
+ summary: Prometheus has issues reloading data blocks from disk
+ expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
+ for: 12h  # The expression has to hold for 12 hours before the alert fires.
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBCompactionsFailing  # Raised when the failed compaction counter increased within the 2h window of expr.
+ annotations:  # The description states two hours so that it matches the [2h] range the reported $value is computed over.
+ description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last two hours.'
+ summary: Prometheus has issues compacting sample blocks
+ expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
+ for: 12h  # The expression has to hold for 12 hours before the alert fires.
+ labels:
+ severity: warning
+ - alert: PrometheusTSDBWALCorruptions  # Raised when the WAL corruption counter is above 0.
+ annotations:
+ description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} has a corrupted write-ahead log (WAL).'
+ summary: Prometheus write-ahead log is corrupted
+ expr: prometheus_tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0
+ for: 4h
+ labels:
+ severity: warning
+ - alert: PrometheusNotIngestingSamples  # Raised when the 5m rate of appended head samples is zero or less.
+ annotations:
+ description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} isn't ingesting samples.
+ summary: Prometheus isn't ingesting samples
+ expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+ for: 10m
+ labels:
+ severity: warning
+ - alert: PrometheusTargetScrapesDuplicate  # Raised when the duplicate-timestamp rejection counter increased within the last 5m.
+ annotations:
+ description: '{{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has many samples rejected due to duplicate timestamps but different values'
+ summary: Prometheus has many samples rejected
+ expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
+ for: 10m
+ labels:
+ severity: warning
+{{- end }} \ No newline at end of file