From b88d3ec06abbb3e69e5a1446d15096622e791cce Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Sat, 3 Feb 2024 16:41:14 +0000 Subject: [PATCH] Add blackbox exporter addon (#241) * Add blackbox exporter addon * Fix typo in ingress alerts --- .github/workflows/update-addons.yml | 3 + .../blackbox-exporter-dashboard.json | 932 ++++++++++++++++++ .../cluster-addons/templates/cni/calico.yaml | 2 + .../templates/ingress-nginx.yaml | 26 +- .../monitoring/blackbox-exporter.yaml | 147 +++ .../monitoring/kube-prometheus-stack.yaml | 4 +- .../templates/monitoring/loki-stack.yaml | 6 + charts/cluster-addons/values.yaml | 15 + .../prometheus-blackbox-exporter.yaml | 4 + 9 files changed, 1130 insertions(+), 9 deletions(-) create mode 100644 charts/cluster-addons/grafana-dashboards/blackbox-exporter-dashboard.json create mode 100644 charts/cluster-addons/templates/monitoring/blackbox-exporter.yaml create mode 100644 skopeo-manifests/prometheus-blackbox-exporter.yaml diff --git a/.github/workflows/update-addons.yml b/.github/workflows/update-addons.yml index 8a43ccc..18a5480 100644 --- a/.github/workflows/update-addons.yml +++ b/.github/workflows/update-addons.yml @@ -61,6 +61,9 @@ jobs: - key: loki-stack path: monitoring.lokiStack.chart + - key: prometheus-blackbox-exporter + path: monitoring.blackboxExporter.chart + - key: node-feature-discovery path: nodeFeatureDiscovery.chart diff --git a/charts/cluster-addons/grafana-dashboards/blackbox-exporter-dashboard.json b/charts/cluster-addons/grafana-dashboards/blackbox-exporter-dashboard.json new file mode 100644 index 0000000..53143aa --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/blackbox-exporter-dashboard.json @@ -0,0 +1,932 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Blackbox exporter HTTP prober dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 13659, + "graphTooltip": 0, + "id": 34, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "SSL Cert Expiry (days)" + }, + "properties": [ + { + "id": "decimals", + "value": 0 + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(0, 0, 0, 0)", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 7 + }, + { + "color": "green", + "value": 30 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "min", + "value": 0 + }, + { + "id": "max", + "value": 365 + }, + { + "id": "custom.filterable", + "value": false + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "DOWN" + }, + "1": { + "text": "UP" + } + }, + "type": "value" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "custom.width", + "value": 76 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Code" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(0, 0, 0, 0)", + "value": null + }, + { + "color": "green", + "value": 200 + }, + { + "color": "orange", + "value": 300 + }, + { + "color": "red", + "value": 500 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "" + } + }, + "type": "value" + } + ] + }, + { + "id": "custom.width", + "value": 78 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "SSL" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "NO" + }, + "1": { + "text": "OK" + } + }, + "type": "value" + } + ] + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(3, 3, 3, 0)", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "gradient", + "type": "color-background" + } + }, + { + "id": "custom.width", + "value": 77 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Probe Duration (s)" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.8 + }, + { + "color": "red", + "value": 2 + } + ] + } + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "custom.filterable", + "value": false + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "max", + "value": 3 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "DNS Lookup Duration (s)" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "color": "red", + "value": 0.2 + } + ] + } + }, + { + "id": "max", + "value": 0.3 + }, + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + }, + { + "id": "custom.filterable", + "value": false + }, + { + "id": "decimals", + "value": 3 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Target" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "targetBlank": true, + "title": "${__data.fields.Target}", + "url": "${__data.fields.Instance}" + } + ] + }, + { + "id": "custom.width", + "value": 276 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "TLS Version" + }, + "properties": [ + { + "id": "custom.width", + "value": 117 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "SSL Cert Expiry (days)" + } + ] + }, + "pluginVersion": "10.2.3", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "probe_success{target=~\"$target\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "probe_http_ssl{target=~\"$target\"} > 0", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "(probe_ssl_earliest_cert_expiry{target=~\"$target\"} - time()) / 3600 / 24", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "probe_http_status_code{target=~\"$target\"} > 0", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg_over_time(probe_duration_seconds{target=~\"$target\"}[1m])", + "format": "table", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "probe_tls_version_info{target=~\"$target\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg_over_time(probe_dns_lookup_time_seconds{target=~\"$target\"}[1m])", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "G" + } + ], + "title": "HTTP Probe Overview", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "target" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "Time 8": true, + "Value": false, + "Value #A": false, + "Value #B": false, + "Value #F": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "__name__ 5": true, + "__name__ 6": true, + "__name__ 7": true, + "container 1": true, + "container 2": true, + "container 3": true, + "container 4": true, + "container 5": true, + "container 6": true, + "container 7": true, + "endpoint 1": true, + "endpoint 2": true, + "endpoint 3": true, + "endpoint 4": true, + "endpoint 5": true, + "endpoint 6": true, + "endpoint 7": true, + "instance 1": true, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "job": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true, + "job 8": true, + "namespace 1": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "namespace 5": true, + "namespace 6": true, + "namespace 7": true, + "phase": true, + "pod 1": true, + "pod 2": true, + "pod 3": true, + "pod 4": true, + "pod 5": true, + "pod 6": true, + "pod 7": true, + "service 1": true, + "service 2": true, + "service 3": true, + "service 4": true, + "service 5": true, + "service 6": true, + "service 7": true, + "type": true, + "type 1": true, + "type 2": true, + "type 3": true, + "type 4": true, + "type 5": true, + "type 6": true, + "type 7": true, + "type 8": true, + "version": false + }, + "includeByName": {}, + "indexByName": { + "Time 1": 9, + "Time 2": 13, + "Time 3": 17, + "Time 4": 20, + "Time 5": 24, + "Time 6": 28, + "Time 7": 32, + "Value #A": 1, + "Value #B": 3, + "Value #C": 5, + "Value #D": 2, + "Value #E": 6, + "Value #F": 8, + "Value #G": 7, + "__name__ 1": 10, + "__name__ 2": 14, + "__name__ 3": 21, + "__name__ 4": 25, + "__name__ 5": 29, + "instance": 1, + "job 1": 11, + "job 2": 15, + "job 3": 18, + "job 4": 22, + "job 5": 26, + "job 6": 30, + "target": 0, + "type 1": 12, + "type 2": 16, + "type 3": 19, + "type 4": 23, + "type 5": 27, + "type 6": 31, + "version": 4 + }, + "renameByName": { + "Value": "Up", + "Value #A": "Status", + "Value #B": "SSL", + "Value #C": "SSL Cert Expiry (days)", + "Value #D": "Code", + "Value #E": "Probe Duration (s)", + "Value #F": "", + "Value #G": "DNS Lookup Duration (s)", + "Value #H": "Probe IP", + "instance": "URL", + "target": "Target", + "type 6": "", + "version": "TLS Version" + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 599, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed+area" + } + }, + "mappings": [], + "max": 600, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 300 + }, + { + "color": "red", + "value": 500 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(probe_http_status_code{target=~\"$target\"}) by(target)", + "instant": false, + "legendFormat": "{{target}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Status Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(probe_duration_seconds{target=~\"$target\"}) by(target)", + "hide": false, + "instant": false, + "legendFormat": "{{target}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Probe Duration", + "transformations": [], + "type": "timeseries" + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": [ + "blackbox", + "prometheus" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(probe_success, target)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "target", + "options": [], + "query": { + "query": "label_values(probe_success, target)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Blackbox Exporter (HTTP prober)", + "uid": "NEzutrbMk", + "version": 10, + "weekStart": "" +} diff --git a/charts/cluster-addons/templates/cni/calico.yaml b/charts/cluster-addons/templates/cni/calico.yaml index 92f6b28..30c53ef 100644 --- a/charts/cluster-addons/templates/cni/calico.yaml +++ b/charts/cluster-addons/templates/cni/calico.yaml @@ -153,6 +153,8 @@ spec: grafana_dashboard: "1" data: cni-calico-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/cni-calico-dashboard.json" | nindent 12 }} + {% endraw %} {{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml index e6a3f64..544a0e3 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -65,13 +65,15 @@ spec: grafana_dashboard: "1" data: nginx-ingress-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/ingress-nginx-dashboard.json" | nindent 12 }} + {% endraw %} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: Manifests metadata: - name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-alerting-rules - labels: {{ include "cluster-addons.componentLabels" (list . "ingress-nginx") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "ingress-nginx-alerts") }} + labels: {{ include "cluster-addons.componentLabels" (list . "ingress-nginx-alerts") | nindent 4 }} annotations: # Tell Argo to ignore the non-controller owner references for this object argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" @@ -79,34 +81,42 @@ spec: clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true targetNamespace: {{ .Values.ingress.nginx.release.namespace }} - releaseName: ingress-nginx-alerting-rules + releaseName: ingress-nginx-alerts manifestSources: - template: | {% raw %} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-alerting-rules + name: ingress-nginx-alerts spec: groups: - name: ingress_nginx.rules rules: - alert: NginxIngressCertificateExpiresSoon expr: | - nginx_ingress_controller_ssl_expire_time_seconds < (time() + (30 * 24 * 3600)) - for: 1h + (7 * 24 * 3600) <= (last_over_time(nginx_ingress_controller_ssl_expire_time_seconds[10m]) - time()) < (30 * 24 * 3600) + for: 0m annotations: message: "The cert {{ "{{" }} $labels.name {{ "}}" }} is {{ "{{" }} $value | humanizeDuration {{ "}}" }} from expiry." labels: severity: warning - alert: NginxIngressCertificateExpiresVerySoon expr: | - nginx_ingress_controller_ssl_expire_time_seconds < (time() + (7 * 24 * 3600)) - for: 1m + 0 <= (last_over_time(nginx_ingress_controller_ssl_expire_time_seconds[10m]) - time()) < (7 * 24 * 3600) + for: 0m annotations: message: "The cert {{ "{{" }} $labels.name {{ "}}" }} is {{ "{{" }} $value | humanizeDuration {{ "}}" }} from expiry." labels: severity: critical + - alert: NginxIngressCertificateExpired + expr: | + (last_over_time(nginx_ingress_controller_ssl_expire_time_seconds[10m]) - time()) < 0 + for: 0m + annotations: + message: "The cert {{ "{{" }} $labels.name {{ "}}" }} has expired." + labels: + severity: critical {% endraw %} {{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/monitoring/blackbox-exporter.yaml b/charts/cluster-addons/templates/monitoring/blackbox-exporter.yaml new file mode 100644 index 0000000..623087e --- /dev/null +++ b/charts/cluster-addons/templates/monitoring/blackbox-exporter.yaml @@ -0,0 +1,147 @@ +{{- if and .Values.monitoring.enabled .Values.monitoring.blackboxExporter.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "blackbox-exporter") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + serviceMonitor: + enabled: true + {% if cloud_identity and "clouds.yaml" in cloud_identity.data %} + {% set clouds_data = cloud_identity.data["clouds.yaml"] | b64decode | fromyaml %} + targets: + {% for name, config in clouds_data.clouds.items() %} + - name: {{ "{{" }} name {{ "}}" }}-auth-url + url: {{ "{{" }} config.auth.auth_url {{ "}}" }} + {% endfor %} + {% endif %} + overrides: | + {{- toYaml .Values.monitoring.blackboxExporter.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter") }} + labels: {{ include "cluster-addons.componentLabels" (list . "blackbox-exporter") | nindent 4 }} + annotations: + # Tell Argo to ignore the non-controller owner references for this object + argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.monitoring.blackboxExporter.chart | nindent 4 }} + targetNamespace: {{ .Values.monitoring.blackboxExporter.release.namespace }} + releaseName: prometheus-blackbox-exporter + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter") }}-config + key: overrides +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter-dashboards") }} + labels: {{ include "cluster-addons.componentLabels" (list . "blackbox-exporter-dashboards") | nindent 4 }} + annotations: + # Tell Argo to ignore the non-controller owner references for this object + argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + targetNamespace: {{ .Values.monitoring.blackboxExporter.release.namespace }} + releaseName: blackbox-exporter-dashboards + manifestSources: + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: blackbox-exporter-dashboards + labels: + grafana_dashboard: "1" + data: + blackbox-exporter-dashboard.json: | + {% raw %} + {{- .Files.Get "grafana-dashboards/blackbox-exporter-dashboard.json" | nindent 12 }} + {% endraw %} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "blackbox-exporter-alerts") }} + labels: {{ include "cluster-addons.componentLabels" (list . "blackbox-exporter-alerts") | nindent 4 }} + annotations: + # Tell Argo to ignore the non-controller owner references for this object + argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + targetNamespace: {{ .Values.monitoring.blackboxExporter.release.namespace }} + releaseName: blackbox-exporter-alerts + manifestSources: + - template: | + {% raw %} + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + metadata: + name: blackbox-exporter-alerts + spec: + groups: + - name: blackbox_exporter.rules + rules: + - alert: BlackboxProbeFailed + expr: probe_success == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "Blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' failed" + - alert: BlackboxSlowProbe + expr: avg_over_time(probe_duration_seconds[1m]) > 1 + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "Blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' took more than 1s to complete - {{ "{{" }} $value {{ "}}" }}" + - alert: BlackboxProbeHttpFailure + expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "Blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' returned an HTTP error status - {{ "{{" }} $value {{ "}}" }}" + - alert: BlackboxSslCertificateWillExpireSoon + expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600) + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "SSL certificate for blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' expires in {{ "{{" }} $value | humanizeDuration {{ "}}" }}" + - alert: BlackboxSslCertificateWillExpireVerySoon + expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600) + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire very soon (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "SSL certificate for blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' expires in {{ "{{" }} $value | humanizeDuration {{ "}}" }}" + - alert: BlackboxSslCertificateExpired + expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0 + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (target {{ "{{" }} $labels.target {{ "}}" }}) + description: "SSL certificate for blackbox probe '{{ "{{" }} $labels.target {{ "}}" }}' has expired" + {% endraw %} +{{- end }} diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 5f414ef..a96402c 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -176,5 +176,7 @@ spec: grafana_dashboard: "1" data: nvidia-dcgm-exporter-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} -{{- end }} \ No newline at end of file + {% endraw %} +{{- end }} diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml index 9d691f6..2f1fc61 100644 --- a/charts/cluster-addons/templates/monitoring/loki-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml @@ -154,9 +154,15 @@ spec: grafana_dashboard: "1" data: loki-pod-logs-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/loki-pod-logs-dashboard.json" | nindent 12 }} + {% endraw %} loki-systemd-logs-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/loki-systemd-logs-dashboard.json" | nindent 12 }} + {% endraw %} loki-metrics-dashboard.json: | + {% raw %} {{- .Files.Get "grafana-dashboards/loki-metrics-dashboard.json" | nindent 12 }} + {% endraw %} {{- end }} diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index b382065..da642c4 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -228,6 +228,21 @@ monitoring: persistence: enabled: true size: 10Gi + # Configuration for the blackbox exporter + blackboxExporter: + enabled: true + chart: + repo: https://prometheus-community.github.io/helm-charts + name: prometheus-blackbox-exporter + version: 8.10.1 + release: + namespace: monitoring-system + values: {} + # Example of adding additional scrape targets + # serviceMonitor: + # targets: + # - name: example + # url: http://example.com/healthz # Settings for node feature discovery # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery diff --git a/skopeo-manifests/prometheus-blackbox-exporter.yaml b/skopeo-manifests/prometheus-blackbox-exporter.yaml new file mode 100644 index 0000000..a440e9f --- /dev/null +++ b/skopeo-manifests/prometheus-blackbox-exporter.yaml @@ -0,0 +1,4 @@ +quay.io: + images: + prometheus/blackbox-exporter: + - v0.24.0