aboutsummaryrefslogtreecommitdiffstats
path: root/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules-1.14/etcd.yaml
blob: 97a9825d8ddbc5f8e5f4315975e5928549fbe196 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
{{- /*
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/master/Documentation/op-guide/etcd3_alert.rules.yml
Do not change in-place! In order to change this file first read following link:
https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "prometheus-operator.namespace" . }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: etcd
    rules:
    - alert: etcdMembersDown
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
      expr: |-
        max by (job) (
          sum by (job) (up{job=~".*etcd.*"} == bool 0)
        or
          count by (job,endpoint) (
            sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
          )
        )
        > 0
      for: 3m
      labels:
        severity: critical
    - alert: etcdInsufficientMembers
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
      expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
      for: 3m
      labels:
        severity: critical
    - alert: etcdNoLeader
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
      expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
      for: 1m
      labels:
        severity: critical
    - alert: etcdHighNumberOfLeaderChanges
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
      expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
      for: 5m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
          /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
          > 1
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
          /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
          > 5
      for: 5m
      labels:
        severity: critical
    - alert: etcdGRPCRequestsSlow
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
        > 0.15
      for: 10m
      labels:
        severity: critical
    - alert: etcdMemberCommunicationSlow
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.15
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedProposals
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
      for: 15m
      labels:
        severity: warning
    - alert: etcdHighFsyncDurations
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.5
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighCommitDurations
      annotations:
        message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.25
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedHTTPRequests
      annotations:
        message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
      expr: |-
        sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
        BY (method) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedHTTPRequests
      annotations:
        message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
      expr: |-
        sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
        BY (method) > 0.05
      for: 10m
      labels:
        severity: critical
    - alert: etcdHTTPRequestsSlow
      annotations:
        message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
      expr: |-
        histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
        > 0.15
      for: 10m
      labels:
        severity: warning
{{- end }}