{{- /* Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml Do not change in-place! In order to change this file first read following link: https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }} {{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }} {{- $namespace := printf "%s" (include "prometheus-operator.namespace" .) }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }} namespace: {{ template "prometheus-operator.namespace" . }} labels: app: {{ template "prometheus-operator.name" . }} {{ include "prometheus-operator.labels" . | indent 4 }} {{- if .Values.defaultRules.labels }} {{ toYaml .Values.defaultRules.labels | indent 4 }} {{- end }} {{- if .Values.defaultRules.annotations }} annotations: {{ toYaml .Values.defaultRules.annotations | indent 4 }} {{- end }} spec: groups: - name: prometheus.rules rules: - alert: PrometheusConfigReloadFailed annotations: description: Reloading Prometheus' configuration has failed for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} summary: Reloading Prometheus' configuration failed expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0 for: 10m labels: severity: warning - alert: PrometheusNotificationQueueRunningFull annotations: description: Prometheus' alert notification queue is running full for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} summary: Prometheus' alert notification queue is running full expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} for: 10m labels: severity: warning - alert: PrometheusErrorSendingAlerts annotations: description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} summary: Errors while sending alert from Prometheus expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01 for: 10m labels: severity: warning - alert: PrometheusErrorSendingAlerts annotations: description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}} summary: Errors while sending alerts from Prometheus expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03 for: 10m labels: severity: critical - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} is not connected to any Alertmanagers summary: Prometheus is not connected to any Alertmanagers expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1 for: 10m labels: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 for: 12h labels: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0 for: 12h labels: severity: warning - alert: PrometheusTSDBWALCorruptions annotations: description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} has a corrupted write-ahead log (WAL).' summary: Prometheus write-ahead log is corrupted expr: prometheus_tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0 for: 4h labels: severity: warning - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} isn't ingesting samples. summary: Prometheus isn't ingesting samples expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 for: 10m labels: severity: warning - alert: PrometheusTargetScrapesDuplicate annotations: description: '{{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has many samples rejected due to duplicate timestamps but different values' summary: Prometheus has many samples rejected expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m labels: severity: warning {{- end }}