diff options
Diffstat (limited to 'vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules')
16 files changed, 1295 insertions, 0 deletions
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml new file mode 100644 index 00000000..f196db48 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/alertmanager.rules.yaml @@ -0,0 +1,47 @@ +# Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }} +{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + message: The configuration of the instances of the Alertmanager cluster `{{`{{$labels.service}}`}}` are out of sync. + expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerFailedReload + annotations: + message: Reloading Alertmanager's configuration has failed for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}}. + expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}"} == 0 + for: 10m + labels: + severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. + expr: |- + alertmanager_cluster_members{job="{{ $alertmanagerJob }}"} + != on (service) GROUP_LEFT() + count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}"}) + for: 5m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml new file mode 100644 index 00000000..dd0140db --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/etcd.yaml @@ -0,0 +1,134 @@ +# Generated from 'etcd' group from https://raw.githubusercontent.com/paskal/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml +{{- if and .Values.defaultRules.create .Values.kubeEtcd.enabled }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: etcd + rules: + - alert: etcdInsufficientMembers + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": insufficient members ({{`{{ $value }}`}}).' + expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) + for: 3m + labels: + severity: critical + - alert: etcdNoLeader + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": member {{`{{ $labels.instance }}`}} has no leader.' + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + - alert: etcdHighNumberOfLeaderChanges + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": instance {{`{{ $labels.instance }}`}} has seen {{`{{ $value }}`}} leader changes within the last hour.' + expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 + for: 15m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 1 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}}% of requests for {{`{{ $labels.grpc_method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) + > 5 + for: 5m + labels: + severity: critical + - alert: etcdGRPCRequestsSlow + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": gRPC requests to {{`{{ $labels.grpc_method }}`}} are taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) + > 0.15 + for: 10m + labels: + severity: critical + - alert: etcdMemberCommunicationSlow + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": member communication with {{`{{ $labels.To }}`}} is taking {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.15 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedProposals + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": {{`{{ $value }}`}} proposal failures within the last hour on etcd instance {{`{{ $labels.instance }}`}}.' + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile fync durations are {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.5 + for: 10m + labels: + severity: warning + - alert: etcdHighCommitDurations + annotations: + message: 'etcd cluster "{{`{{ $labels.job }}`}}": 99th percentile commit durations {{`{{ $value }}`}}s on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.25 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedHTTPRequests + annotations: + message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}' + expr: |- + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedHTTPRequests + annotations: + message: '{{`{{ $value }}`}}% of requests for {{`{{ $labels.method }}`}} failed on etcd instance {{`{{ $labels.instance }}`}}.' + expr: |- + sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) + BY (method) > 0.05 + for: 10m + labels: + severity: critical + - alert: etcdHTTPRequestsSlow + annotations: + message: etcd instance {{`{{ $labels.instance }}`}} HTTP requests to {{`{{ $labels.method }}`}} are slow. + expr: |- + histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml new file mode 100644 index 00000000..020c29eb --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/general.rules.yaml @@ -0,0 +1,34 @@ +# Generated from 'general.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{`{{ $value }}`}}% of the {{`{{ $labels.job }}`}} targets are down.' + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + - alert: DeadMansSwitch + annotations: + message: This is a DeadMansSwitch meant to ensure that the entire alerting pipeline is functional. + expr: vector(1) + labels: + severity: none +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml new file mode 100644 index 00000000..620bd15b --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/k8s.rules.yaml @@ -0,0 +1,58 @@ +# Generated from 'k8s.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules + rules: + - expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: |- + sum by (namespace, pod_name, container_name) ( + rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) + ) + record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate + - expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: |- + sum by (namespace, label_name) ( + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: |- + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: |- + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: |- + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml new file mode 100644 index 00000000..d1db5296 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-apiserver.rules.yaml @@ -0,0 +1,33 @@ +# Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create .Values.kubeApiServer.enabled }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-apiserver.rules + rules: + - expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.99' + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.9' + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.5' + record: cluster_quantile:apiserver_request_latencies:histogram_quantile +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml new file mode 100644 index 00000000..d0a643b7 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-alerting.rules.yaml @@ -0,0 +1,35 @@ +# Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-prometheus-node-alerting.rules + rules: + - alert: NodeDiskRunningFull + annotations: + message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 24 hours. + expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)' + for: 30m + labels: + severity: warning + - alert: NodeDiskRunningFull + annotations: + message: Device {{`{{ $labels.device }}`}} of node-exporter {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} will be full within the next 2 hours. + expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)' + for: 10m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml new file mode 100644 index 00000000..87d3556a --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-prometheus-node-recording.rules.yaml @@ -0,0 +1,35 @@ +# Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + record: cluster:node_cpu:ratio +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml new file mode 100644 index 00000000..3a279661 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kube-scheduler.rules.yaml @@ -0,0 +1,57 @@ +# Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create .Values.kubeScheduler.enabled }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kube-scheduler.rules + rules: + - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_binding_latency:histogram_quantile +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml new file mode 100644 index 00000000..37fc5465 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-absent.yaml @@ -0,0 +1,120 @@ +# Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }} +{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }} +{{- $alertmanagerJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "alertmanager" }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-absent + rules: + - alert: AlertmanagerDown + annotations: + message: Alertmanager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown + expr: absent(up{job="{{ $alertmanagerJob }}"} == 1) + for: 15m + labels: + severity: critical +{{- if .Values.kubeDns.enabled }} + - alert: CoreDNSDown + annotations: + message: CoreDNS has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown + expr: absent(up{job="kube-dns"} == 1) + for: 15m + labels: + severity: critical +{{- if .Values.kubeApiServer.enabled }} +{{- end }} + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown + expr: absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- if .Values.kubeControllerManager.enabled }} + - alert: KubeControllerManagerDown + annotations: + message: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- if .Values.kubeScheduler.enabled }} + - alert: KubeSchedulerDown + annotations: + message: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown + expr: absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- if .Values.kubeStateMetrics.enabled }} + - alert: KubeStateMetricsDown + annotations: + message: KubeStateMetrics has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown + expr: absent(up{job="kube-state-metrics"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- if .Values.prometheusOperator.kubeletService.enabled }} + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown + expr: absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- if .Values.nodeExporter.enabled }} + - alert: NodeExporterDown + annotations: + message: NodeExporter has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown + expr: absent(up{job="node-exporter"} == 1) + for: 15m + labels: + severity: critical +{{- end }} + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown + expr: absent(up{job="{{ $prometheusJob }}"} == 1) + for: 15m + labels: + severity: critical +{{- if .Values.prometheusOperator.enabled }} + - alert: PrometheusOperatorDown + annotations: + message: PrometheusOperator has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown + expr: absent(up{job="{{ $operatorJob }}"} == 1) + for: 15m + labels: + severity: critical +{{- end }} +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml new file mode 100644 index 00000000..21549c23 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-apps.yaml @@ -0,0 +1,154 @@ +# Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create .Values.kubeStateMetrics.enabled }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} ({{`{{ $labels.container }}`}}) is restarting {{`{{ printf "%.2f" $value }}`}} times / 5 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping + expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} has been in a non-ready state for longer than an hour. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready + expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} has not matched the expected number of replicas for longer than an hour. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch + expr: |- + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + for: 1h + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch + expr: |- + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet generation for {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch + expr: |- + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.statefulset }}`}} update has not been rolled out. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout + expr: |- + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{`{{ $value }}`}}% of the desired Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are scheduled and ready. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck + expr: |- + kube_daemonset_status_number_ready{job="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are not scheduled.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled + expr: |- + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: '{{`{{ $value }}`}} Pods of DaemonSet {{`{{ $labels.namespace }}`}}/{{`{{ $labels.daemonset }}`}} are running where they are not supposed to run.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{`{{ $labels.namespace }}`}}/{{`{{ $labels.cronjob }}`}} is taking more than 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning + expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} is taking more than one hour to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{`{{ $labels.namespace }}`}}/{{`{{ $labels.job_name }}`}} failed to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + expr: kube_job_status_failed{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml new file mode 100644 index 00000000..4a7b9f95 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-resources.yaml @@ -0,0 +1,93 @@ +# Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: |- + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: |- + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal_bytes) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: |- + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: |- + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + expr: |- + 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 90 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + message: '{{`{{ printf "%0.0f" $value }}`}}% throttling of CPU in namespace {{`{{ $labels.namespace }}`}} for container {{`{{ $labels.container_name }}`}} in pod {{`{{ $labels.pod_name }}`}}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25" + for: 15m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml new file mode 100644 index 00000000..d290f0cf --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-storage.yaml @@ -0,0 +1,56 @@ +# Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ printf "%0.2f" $value }}`}}% free. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical + expr: |- + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ printf "%0.2f" $value }}`}}% is available. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays + expr: |- + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + ) < 15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml new file mode 100644 index 00000000..78f90b79 --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/kubernetes-system.yaml @@ -0,0 +1,117 @@ +# Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{`{{ $labels.node }}`}} has been unready for more than an hour.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready + expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{`{{ $value }}`}} different versions of Kubernetes components running. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch + expr: count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}}% errors.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: |- + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{`{{ $labels.job }}`}}/{{`{{ $labels.instance }}`}}' is experiencing {{`{{ printf "%0.0f" $value }}`}} errors / second. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{`{{ $labels.instance }}`}} is running {{`{{ $value }}`}} Pods, close to the limit of 110. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods + expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{`{{ $value }}`}} seconds for {{`{{ $labels.verb }}`}} {{`{{ $labels.resource }}`}}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{`{{ $value }}`}}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: |- + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{`{{ $value }}`}}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: |- + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 7 days. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 24 hours. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml new file mode 100644 index 00000000..b039d1ac --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/node.rules.yaml @@ -0,0 +1,184 @@ +# Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create .Values.nodeExporter.enabled }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: |- + count by (node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilisation:avg1m + - expr: |- + 1 - avg by (node) ( + rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilisation:avg1m + - expr: |- + sum(node_load1{job="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: |- + sum by (node) ( + node_load1{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: |- + 1 - + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: ':node_memory_utilisation:' + - expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum + - expr: sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum + - expr: |- + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: |- + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: |- + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:node_memory_utilisation:ratio + - expr: |- + 1e3 * sum( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: |- + 1 - + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilisation:' + - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilisation_2:' + - expr: |- + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) + record: :node_disk_utilisation:avg_irate + - expr: |- + avg by (node) ( + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilisation:avg_irate + - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) + record: :node_disk_saturation:avg_irate + - expr: |- + avg by (node) ( + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: |- + max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' + - expr: |- + sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_utilisation:sum_irate + - expr: |- + sum by (node) ( + (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilisation:sum_irate + - expr: |- + sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_saturation:sum_irate + - expr: |- + sum by (node) ( + (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - expr: |- + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_total:' + - expr: |- + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_free:' +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml new file mode 100644 index 00000000..7f19763e --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus-operator.yaml @@ -0,0 +1,36 @@ +# Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +{{- $operatorJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "operator" }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: prometheus-operator + rules: + - alert: PrometheusOperatorReconcileErrors + annotations: + message: Errors while reconciling {{`{{ $labels.controller }}`}} in {{`{{ $labels.namespace }}`}} Namespace. + expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + message: Errors while reconciling Prometheus in {{`{{ $labels.namespace }}`}} Namespace. + expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}"}[5m]) > 0.1 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml new file mode 100644 index 00000000..e2d8a68d --- /dev/null +++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml @@ -0,0 +1,102 @@ +# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml +{{- if and .Values.defaultRules.create }} +{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }} +apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }} +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }} + labels: + app: {{ template "prometheus-operator.name" . }} +{{ include "prometheus-operator.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: prometheus.rules + rules: + - alert: PrometheusConfigReloadFailed + annotations: + description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} + summary: Reloading Prometheus' configuration failed + expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0 + for: 10m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} + summary: Prometheus' alert notification queue is running full + expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"} + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}} + summary: Errors while sending alert from Prometheus + expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}} + summary: Errors while sending alerts from Prometheus + expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03 + for: 10m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers + expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBWALCorruptions + annotations: + description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).' + summary: Prometheus write-ahead log is corrupted + expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples. + summary: Prometheus isn't ingesting samples + expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusTargetScrapesDuplicate + annotations: + description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values' + summary: Prometheus has many samples rejected + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0 + for: 10m + labels: + severity: warning +{{- end }}
\ No newline at end of file |