1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
{{- /*
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "prometheus-operator.namespace" . }}
labels:
app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: kubernetes-system
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
- alert: KubeVersionMismatch
annotations:
message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
for: 1h
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
expr: |-
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/
sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}} errors / second.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
for: 15m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
expr: |-
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
expr: |-
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
for: 10m
labels:
severity: warning
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
expr: |-
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
expr: |-
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
for: 10m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
{{- end }}
|