path: root/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml
# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
{{- if .Values.defaultRules.create }}
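# $prometheusJob holds the job label value ("<release fullname>-prometheus") that scopes every alert expression below to this release's Prometheus instance.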
{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus.rules
    rules:
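    # The backtick-escaped blocks in the annotations below make Helm emit the inner $labels/$value references literally, so they are evaluated by Prometheus' own alert templating instead.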
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
        summary: Reloading Prometheus' configuration failed
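      # prometheus_config_last_reload_successful is 1 after a successful configuration reload and 0 after a failed one.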
      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
        summary: Prometheus' alert notification queue is running full
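      # predict_linear extrapolates the notification queue length 30 minutes (60 * 30 seconds) ahead from the last 5m of samples; the alert fires once the projection exceeds the queue capacity.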
      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"}
      for: 10m
      labels:
        severity: warning
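    # The next two alerts share a name and differ only in threshold and severity: an alert-send error ratio above 1% fires as a warning, above 3% as critical.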
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} to Alertmanager {{`{{$labels.alertmanager}}`}}
        summary: Errors while sending alerts from Prometheus
      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
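      # Fewer than one discovered Alertmanager endpoint means Prometheus has nowhere to deliver alerts.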
      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last two hours.'
        summary: Prometheus has issues reloading data blocks from disk
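      # The 2h increase in reload failures must stay above zero for 12 hours before this alert fires.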
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last two hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
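      # A non-positive 5m append rate means no new samples are being written to the TSDB head.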
      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
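      # Counts scraped samples rejected because a target exposed the same timestamp more than once with different values.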
      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- end }}