diff options
Diffstat (limited to 'kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14')
25 files changed, 2358 insertions, 0 deletions
diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml new file mode 100755 index 00000000..1c6db409 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -0,0 +1,54 @@ +{{- /* +Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} +{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }} +{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }} +{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync. + expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerFailedReload + annotations: + message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. + expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 + for: 10m + labels: + severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. + expr: |- + alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} + != on (service) GROUP_LEFT() + count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + for: 5m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml new file mode 100755 index 00000000..97a9825d --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml @@ -0,0 +1,155 @@ +{{- /* +Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: etcd + rules: + - alert: etcdMembersDown + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).' + expr: |- + max by (job) ( + sum by (job) (up{job=~".*etcd.*"} == bool 0) + or + count by (job,endpoint) ( + sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01 + ) + ) + > 0 + for: 3m + labels: + severity: critical + - alert: etcdInsufficientMembers + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' + expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) + for: 3m + labels: + severity: critical + - alert: etcdNoLeader + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + - alert: etcdHighNumberOfLeaderChanges + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3 + for: 5m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 1 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 5 + for: 5m + labels: + severity: critical + - alert: etcdGRPCRequestsSlow + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) + > 0.15 + for: 10m + labels: + severity: critical + - alert: etcdMemberCommunicationSlow + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.15 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedProposals + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.5 + for: 10m + labels: + severity: warning + - alert: etcdHighCommitDurations + annotations: + message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.25 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedHTTPRequests + annotations: + message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}' + expr: |- + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedHTTPRequests + annotations: + message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + expr: |- + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.05 + for: 10m + labels: + severity: critical + - alert: etcdHTTPRequestsSlow + annotations: + message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow. + expr: |- + histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml new file mode 100755 index 00000000..4ccd9441 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml @@ -0,0 +1,50 @@ +{{- /* +Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + message: 'This is an alert meant to ensure that the entire alerting pipeline is functional. + + This alert is always firing, therefore it should always be firing in Alertmanager + + and always fire against a receiver. There are integrations with various notification + + mechanisms that send a notification when this alert is not firing. For example the + + "DeadMansSnitch" integration in PagerDuty. + + ' + expr: vector(1) + labels: + severity: none +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml new file mode 100755 index 00000000..4bc2cc7d --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/k8s.rules.yaml @@ -0,0 +1,121 @@ +{{- /* +Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules + rules: + - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: |- + sum by (cluster, namespace, pod, container) ( + rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + - expr: |- + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: |- + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: |- + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: |- + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: |- + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_memory_bytes:sum + - expr: |- + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_cpu_cores:sum + - expr: |- + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: mixin_pod_workload + - expr: |- + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: mixin_pod_workload + - expr: |- + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: mixin_pod_workload +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml new file mode 100755 index 00000000..010d3446 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -0,0 +1,71 @@ +{{- /* +Generated from 'kube-apiserver-slos' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverSlos }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver-slos" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + expr: |- + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + severity: critical + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + expr: |- + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + severity: critical + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + expr: |- + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + severity: warning + - alert: KubeAPIErrorBudgetBurn + annotations: + message: The API server is burning too much error budget + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorbudgetburn + expr: |- + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml new file mode 100755 index 00000000..1b00134e --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-apiserver.rules.yaml @@ -0,0 +1,393 @@ +{{- /* +Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-apiserver.rules + rules: + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[1h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[2h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[3d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[5m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) + - + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[6h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: |- + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h + - expr: |- + 1 - ( + ( + # write too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + ( + # read too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + - + ( + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + ) + + # errors + sum(code:apiserver_request_total:increase30d{code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: |- + 1 - ( + sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + - + ( + # too slow + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="resource",le="0.1"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: |- + 1 - ( + ( + # too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."}) + ) + / + sum(code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: sum by (code, verb) (increase(apiserver_request_total{job="apiserver"}[30d])) + record: code_verb:apiserver_request_total:increase30d + - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: |- + sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod) + record: cluster:apiserver_request_duration_seconds:mean5m + - expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml new file mode 100755 index 00000000..0b963276 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml @@ -0,0 +1,31 @@ +{{- /* +Generated from 'kube-prometheus-general.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-general.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml new file mode 100755 index 00000000..1ff4cb07 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml @@ -0,0 +1,41 @@ +{{- /* +Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml new file mode 100755 index 00000000..ec718cef --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-scheduler.rules.yaml @@ -0,0 +1,63 @@ +{{- /* +Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-scheduler.rules + rules: + - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml new file mode 100755 index 00000000..6f281bcb --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -0,0 +1,51 @@ +{{- /* +Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricslisterrors + expr: |- + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricswatcherrors + expr: |- + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml new file mode 100755 index 00000000..9d9fa950 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubelet.rules.yaml @@ -0,0 +1,39 @@ +{{- /* +Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubelet.rules + rules: + - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.99' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.9' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.5' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml new file mode 100755 index 00000000..3ae09119 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -0,0 +1,205 @@ +{{- /* +Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeStateMetrics.enabled .Values.defaultRules.rules.kubernetesApps }} +{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 5 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping + expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0 + for: 15m + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready + expr: sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0 + for: 15m + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch + expr: |- + ( + kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch + expr: |- + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch + expr: |- + kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout + expr: |- + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + ) + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{`{{`}} $value | humanizePercentage {{`}}`}} of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck + expr: |- + kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} < 1.00 + for: 15m + labels: + severity: critical + - alert: KubeContainerWaiting + annotations: + message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting + expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled + expr: |- + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning + expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion + expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed + expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + message: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch + expr: |- + (kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + != + kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + and + changes(kube_hpa_status_current_replicas[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + message: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has been running at max replicas for longer than 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout + expr: |- + kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + == + kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + for: 15m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml new file mode 100755 index 00000000..0247f5eb --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -0,0 +1,103 @@ +{{- /* +Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit + expr: |- + sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) + / + sum(kube_node_status_allocatable_cpu_cores) + > + (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) + for: 5m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryovercommit + expr: |- + sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) + / + sum(kube_node_status_allocatable_memory_bytes) + > + (count(kube_node_status_allocatable_memory_bytes)-1) + / + count(kube_node_status_allocatable_memory_bytes) + for: 5m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuquotaovercommit + expr: |- + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(kube_node_status_allocatable_cpu_cores) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememoryquotaovercommit + expr: |- + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + / + sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.90 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + message: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh + expr: |- + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml new file mode 100755 index 00000000..f2573966 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -0,0 +1,63 @@ +{{- /* +Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }} +{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup + expr: |- + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + < 0.03 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} + ) < 0.15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml new file mode 100755 index 00000000..7583a599 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -0,0 +1,100 @@ +{{- /* +Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system-apiserver + rules: + - alert: KubeAPILatencyHigh + annotations: + message: The API server has an abnormal latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh + expr: |- + ( + cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} + > + on (verb) group_left() + ( + avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + + + 2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + ) + ) > on (verb) group_left() + 1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0) + and on (verb,resource) + cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} + > + 1 + for: 5m + labels: + severity: warning + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{`{{`}} $value | humanizePercentage {{`}}`}} of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh + expr: |- + sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb) + / + sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - alert: AggregatedAPIErrors + annotations: + message: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapierrors + expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + labels: + severity: warning + - alert: AggregatedAPIDown + annotations: + message: An aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} is down. It has not been available at least for the past five minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-aggregatedapidown + expr: sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0 + for: 5m + labels: + severity: warning +{{- if .Values.kubeApiServer.enabled }} + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown + expr: absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml new file mode 100755 index 00000000..6214d775 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -0,0 +1,37 @@ +{{- /* +Generated from 'kubernetes-system-controller-manager' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeControllerManager.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-controller-manager" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system-controller-manager + rules: +{{- if .Values.kubeControllerManager.enabled }} + - alert: KubeControllerManagerDown + annotations: + message: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml new file mode 100755 index 00000000..bb7a2838 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -0,0 +1,84 @@ +{{- /* +Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready + expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + message: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodeunreachable + expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1 + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods + expr: max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95 + for: 15m + labels: + severity: warning + - alert: KubeNodeReadinessFlapping + annotations: + message: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodereadinessflapping + expr: sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletplegdurationhigh + expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + message: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletpodstartuplatencyhigh + expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning +{{- if .Values.prometheusOperator.kubeletService.enabled }} + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown + expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- end }} diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml new file mode 100755 index 00000000..84e3b1ab --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -0,0 +1,37 @@ +{{- /* +Generated from 'kubernetes-system-scheduler' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system-scheduler" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system-scheduler + rules: +{{- if .Values.kubeScheduler.enabled }} + - alert: KubeSchedulerDown + annotations: + message: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown + expr: absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml new file mode 100755 index 00000000..fc455f37 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -0,0 +1,47 @@ +{{- /* +Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch + expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors + expr: |- + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + > 0.01 + for: 15m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml new file mode 100755 index 00000000..7adf85db --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.rules.yaml @@ -0,0 +1,79 @@ +{{- /* +Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: node-exporter.rules + rules: + - expr: |- + count without (cpu) ( + count without (mode) ( + node_cpu_seconds_total{job="node-exporter"} + ) + ) + record: instance:node_num_cpu:sum + - expr: |- + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m]) + ) + record: instance:node_cpu_utilisation:rate1m + - expr: |- + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: |- + 1 - ( + node_memory_MemAvailable_bytes{job="node-exporter"} + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) + record: instance:node_vmstat_pgmajfault:rate1m + - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_seconds:rate1m + - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate1m + - expr: |- + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate1m + - expr: |- + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate1m + - expr: |- + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate1m + - expr: |- + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate1m +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml new file mode 100755 index 00000000..7b6c601b --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-exporter.yaml @@ -0,0 +1,202 @@ +{{- /* +Generated from 'node-exporter' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: increase(node_network_receive_errs_total[2m]) > 10 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: increase(node_network_transmit_errs_total[2m]) > 10 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used' + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclockskewdetected + summary: Clock skew detected. + expr: |- + ( + node_timex_offset_seconds > 0.05 + and + deriv(node_timex_offset_seconds[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds < -0.05 + and + deriv(node_timex_offset_seconds[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host. + runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeclocknotsynchronising + summary: Clock not synchronising. + expr: min_over_time(node_timex_sync_status[5m]) == 0 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml new file mode 100755 index 00000000..b4b206d8 --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node-network.yaml @@ -0,0 +1,34 @@ +{{- /* +Generated from 'node-network' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}" + expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml new file mode 100755 index 00000000..75ab27ed --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/node.rules.yaml @@ -0,0 +1,53 @@ +{{- /* +Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.nodeExporter.enabled .Values.defaultRules.rules.node }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (cluster, node)) + record: ':kube_pod_info_node_count:' + - expr: |- + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: |- + count by (cluster, node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: |- + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml new file mode 100755 index 00000000..98f2d3bc --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -0,0 +1,43 @@ +{{- /* +Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }} +{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }} +{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: prometheus-operator + rules: + - alert: PrometheusOperatorReconcileErrors + annotations: + message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace. + expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace. + expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml new file mode 100755 index 00000000..71883f7e --- /dev/null +++ b/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/prometheus.yaml @@ -0,0 +1,202 @@ +{{- /* +Generated from 'prometheus' group from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/prometheus-rules.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }} +{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }} +{{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "prometheus-operator.namespace" . }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration. + summary: Failed Prometheus configuration reload. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full. + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: |- + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.' + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: |- + ( + rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + / + rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.' + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: |- + min without(alertmanager) ( + rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + / + rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers. + summary: Prometheus is not connected to any Alertmanagers. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h. + summary: Prometheus has issues reloading blocks from disk. + expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h. + summary: Prometheus has issues compacting blocks. + expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples. + summary: Prometheus is not ingesting samples. + expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with different values but duplicated timestamp. + summary: Prometheus is dropping samples with duplicate timestamps. + expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value {{`}}`}} samples/s with timestamps arriving out of order. + summary: Prometheus drops samples with out-of-order timestamps. + expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}} + summary: Prometheus fails to send samples to remote storage. + expr: |- + ( + rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + / + ( + rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + + + rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}. + summary: Prometheus remote write is behind. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + - on(job, instance) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}. + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m. + summary: Prometheus is failing rule evaluations. + expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m. + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 + for: 15m + labels: + severity: warning +{{- end }}
\ No newline at end of file |