aboutsummaryrefslogtreecommitdiffstats
path: root/kud/tests/vnfs/comp-app/collection/app2/helm/prometheus-operator/templates/prometheus/rules/kubernetes-system.yaml
blob: 36a11931452168a2cb7cb150cfd5dba6644ad24c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
{{- /*
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/coreos/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "prometheus-operator.namespace" . }}
  labels:
    app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
      expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
      expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
      for: 1h
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
      expr: |-
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
          /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}} errors / second.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
      expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
      expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
      expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
      expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
      for: 10m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
{{- end }}