Diffstat (limited to 'vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml')
-rw-r--r-- | vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml | 102
1 file changed, 102 insertions, 0 deletions
diff --git a/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml
new file mode 100644
index 00000000..e2d8a68d
--- /dev/null
+++ b/vnfs/DAaaS/prometheus-operator/templates/alertmanager/rules/prometheus.rules.yaml
@@ -0,0 +1,102 @@
+# Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/coreos/prometheus-operator/master/contrib/kube-prometheus/manifests/prometheus-rules.yaml
+{{- if and .Values.defaultRules.create }}
+{{- $prometheusJob := printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus" }}
+apiVersion: {{ printf "%s/v1" (.Values.prometheusOperator.crdApiGroup | default "monitoring.coreos.com") }}
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s" (include "prometheus-operator.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
+  labels:
+    app: {{ template "prometheus-operator.name" . }}
+{{ include "prometheus-operator.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+  annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+  groups:
+  - name: prometheus.rules
+    rules:
+    - alert: PrometheusConfigReloadFailed
+      annotations:
+        description: Reloading Prometheus' configuration has failed for {{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}}
+        summary: Reloading Prometheus' configuration failed
+      expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}"} == 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusNotificationQueueRunningFull
+      annotations:
+        description: Prometheus' alert notification queue is running full for {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}}
+        summary: Prometheus' alert notification queue is running full
+      expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}"}
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlerts
+      annotations:
+        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
+        summary: Errors while sending alert from Prometheus
+      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.01
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusErrorSendingAlerts
+      annotations:
+        description: Errors while sending alerts from Prometheus {{`{{$labels.namespace}}`}}/{{`{{ $labels.pod}}`}} to Alertmanager {{`{{$labels.Alertmanager}}`}}
+        summary: Errors while sending alerts from Prometheus
+      expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}"}[5m]) > 0.03
+      for: 10m
+      labels:
+        severity: critical
+    - alert: PrometheusNotConnectedToAlertmanagers
+      annotations:
+        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} is not connected to any Alertmanagers
+        summary: Prometheus is not connected to any Alertmanagers
+      expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}"} < 1
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBReloadsFailing
+      annotations:
+        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} reload failures over the last four hours.'
+        summary: Prometheus has issues reloading data blocks from disk
+      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}"}[2h]) > 0
+      for: 12h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBCompactionsFailing
+      annotations:
+        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} had {{`{{$value | humanize}}`}} compaction failures over the last four hours.'
+        summary: Prometheus has issues compacting sample blocks
+      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}"}[2h]) > 0
+      for: 12h
+      labels:
+        severity: warning
+    - alert: PrometheusTSDBWALCorruptions
+      annotations:
+        description: '{{`{{$labels.job}}`}} at {{`{{$labels.instance}}`}} has a corrupted write-ahead log (WAL).'
+        summary: Prometheus write-ahead log is corrupted
+      expr: tsdb_wal_corruptions_total{job="{{ $prometheusJob }}"} > 0
+      for: 4h
+      labels:
+        severity: warning
+    - alert: PrometheusNotIngestingSamples
+      annotations:
+        description: Prometheus {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod}}`}} isn't ingesting samples.
+        summary: Prometheus isn't ingesting samples
+      expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}"}[5m]) <= 0
+      for: 10m
+      labels:
+        severity: warning
+    - alert: PrometheusTargetScrapesDuplicate
+      annotations:
+        description: '{{`{{$labels.namespace}}`}}/{{`{{$labels.pod}}`}} has many samples rejected due to duplicate timestamps but different values'
+        summary: Prometheus has many samples rejected
+      expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}"}[5m]) > 0
+      for: 10m
+      labels:
+        severity: warning
+{{- end }}
\ No newline at end of file
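
For reference, a minimal sketch of what the first rule of this template could render to, assuming a hypothetical release in which the "prometheus-operator.fullname" helper evaluates to "r1-prometheus-operator" (so $prometheusJob becomes "r1-prometheus-operator-prometheus") and prometheusOperator.crdApiGroup is left at its default. Actual names depend on the release and chart values; chart labels and the remaining rules are omitted for brevity:

# Hypothetical rendered output for the PrometheusConfigReloadFailed rule only;
# the release name and resulting job label are assumptions for illustration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: r1-prometheus-operator-prometheus.rules
spec:
  groups:
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
        summary: Reloading Prometheus' configuration failed
      expr: prometheus_config_last_reload_successful{job="r1-prometheus-operator-prometheus"} == 0
      for: 10m
      labels:
        severity: warning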