1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
|
# Generated from 'node.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
{{- if and .Values.defaultRules.create .Values.nodeExporter.enabled }}
apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
labels:
app: {{ template "prometheus-operator.name" . }}
{{ include "prometheus-operator.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: node.rules
rules:
- expr: sum(min(kube_pod_info) by (node))
record: ':kube_pod_info_node_count:'
- expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |-
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
record: node:node_num_cpu:sum
- expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
record: :node_cpu_utilisation:avg1m
- expr: |-
1 - avg by (node) (
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
record: node:node_cpu_utilisation:avg1m
- expr: |-
sum(node_load1{job="node-exporter"})
/
sum(node:node_num_cpu:sum)
record: ':node_cpu_saturation_load1:'
- expr: |-
sum by (node) (
node_load1{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
record: 'node:node_cpu_saturation_load1:'
- expr: |-
1 -
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
/
sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: ':node_memory_utilisation:'
- expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
record: :node_memory_MemFreeCachedBuffers_bytes:sum
- expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: :node_memory_MemTotal_bytes:sum
- expr: |-
sum by (node) (
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_available:sum
- expr: |-
sum by (node) (
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_total:sum
- expr: |-
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
record: node:node_memory_utilisation:ratio
- expr: |-
1e3 * sum(
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
)
record: :node_memory_swap_io_bytes:sum_rate
- expr: |-
1 -
sum by (node) (
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: 'node:node_memory_utilisation:'
- expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
record: 'node:node_memory_utilisation_2:'
- expr: |-
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
record: :node_disk_utilisation:avg_irate
- expr: |-
avg by (node) (
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilisation:avg_irate
- expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
record: :node_disk_saturation:avg_irate
- expr: |-
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_saturation:avg_irate
- expr: |-
max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_usage:'
- expr: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_avail:'
- expr: |-
sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_utilisation:sum_irate
- expr: |-
sum by (node) (
(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_utilisation:sum_irate
- expr: |-
sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_saturation:sum_irate
- expr: |-
sum by (node) (
(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_saturation:sum_irate
- expr: |-
max(
max(
kube_pod_info{job="kube-state-metrics", host_ip!=""}
) by (node, host_ip)
* on (host_ip) group_right (node)
label_replace(
(max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
)
) by (node)
record: 'node:node_inodes_total:'
- expr: |-
max(
max(
kube_pod_info{job="kube-state-metrics", host_ip!=""}
) by (node, host_ip)
* on (host_ip) group_right (node)
label_replace(
(max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
)
) by (node)
record: 'node:node_inodes_free:'
{{- end }}
|