Matt Pryor b88d3ec06a
Add blackbox exporter addon (#241)
* Add blackbox exporter addon

* Fix typo in ingress alerts
2024-02-03 16:41:14 +00:00

183 lines
6.1 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{{- if .Values.monitoring.enabled }}
{{- /*
  Look up the volume sizes requested for Alertmanager and Prometheus in the values
  supplied for the kube-prometheus-stack release. Each lookup falls back to the empty
  string when the path is absent, so the variables double as "is storage configured?"
  flags for the conditionals later in this template.
*/}}
{{- $stackValues := .Values.monitoring.kubePrometheusStack.release.values }}
{{- $alertmanagerVolumeSize := dig "alertmanager" "alertmanagerSpec" "storage" "volumeClaimTemplate" "spec" "resources" "requests" "storage" "" $stackValues }}
{{- $prometheusVolumeSize := dig "prometheus" "prometheusSpec" "storageSpec" "volumeClaimTemplate" "spec" "resources" "requests" "storage" "" $stackValues }}
---
# Secret carrying two Helm values documents for the kube-prometheus-stack release:
#   defaults  - values set by this chart
#   overrides - the values supplied under monitoring.kubePrometheusStack.release.values
# The HelmRelease in this file references both keys as values sources.
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-config
  labels:
    {{- include "cluster-addons.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }}
    addons.stackhpc.com/watch: ""
stringData:
  defaults: |
    alertmanager:
      # Don't apply the namespace grouping by default
      config:
        route:
          group_by: ['...']
      alertmanagerSpec:
        # Make sure that alertmanager finds configurations with the alertmanager name as a label
        alertmanagerConfigSelector:
          matchLabels:
            alertmanager: kube-prometheus-stack-alertmanager
        # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources
        alertmanagerConfigMatcherStrategy:
          type: None
    prometheus:
      prometheusSpec:
        # Tell Prometheus to pick up all monitors and alerting rules, regardless of labels
        podMonitorSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false
        {{- if $prometheusVolumeSize }}
        {{- /* Split the requested size into its leading number and its trailing unit prefix */}}
        {{- $storageAmount := mustRegexFind "^([0-9]*[.])?[0-9]+" $prometheusVolumeSize | float64 }}
        {{- $storageUnits := mustRegexFind "(K|M|G|T|E|P)i?$" $prometheusVolumeSize }}
        {{- /*
          Only emit retentionSize when both parts were recognised. Without a unit prefix
          (plain byte counts, or suffixes such as "k"/"m") the scaled number would be
          printed as bare bytes, possibly in exponent notation, which is either the wrong
          magnitude or not a valid size string.
          NOTE(review): decimal prefixes ("10G") are passed through as "GB"; confirm how
          Prometheus interprets that unit relative to the Kubernetes quantity.
        */}}
        {{- if and $storageAmount $storageUnits }}
        # Set the retention size to 95% of the given volume size
        retentionSize: {{ mulf 0.95 $storageAmount }}{{ $storageUnits }}B
        {{- end }}
        {{- end }}
    grafana:
      sidecar:
        # Tell Grafana to include dashboards from all namespaces
        dashboards:
          searchNamespace: ALL
  overrides: |
    {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }}
---
# HelmRelease that installs kube-prometheus-stack on the target cluster, taking its
# values from the "defaults" and "overrides" keys of the -config secret.
apiVersion: addons.stackhpc.com/v1alpha1
kind: HelmRelease
metadata:
  name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}
  labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }}
  annotations:
    # Tell Argo to ignore the non-controller owner references for this object
    argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true"
spec:
  clusterName: {{ include "cluster-addons.clusterName" . }}
  bootstrap: true
  chart: {{ toYaml .Values.monitoring.kubePrometheusStack.chart | nindent 4 }}
  # Quoted so that a number-like namespace is not re-typed by the YAML parser
  targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace | quote }}
  releaseName: kube-prometheus-stack
  valuesSources:
    - secret:
        name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-config
        key: defaults
    - secret:
        name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-config
        key: overrides
  # StatefulSets do not allow their PVCs to be resized, but we can use lifecycle hooks to
  # ensure that it happens correctly
  lifecycleHooks:
    {{- if or $alertmanagerVolumeSize $prometheusVolumeSize }}
    # Before upgrading, patch the existing PVCs to the requested size for each component
    # that has a volume size configured
    preUpgrade:
      {{- with $alertmanagerVolumeSize }}
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          alertmanager: kube-prometheus-stack-alertmanager
        action: patch
        options:
          data:
            spec:
              resources:
                requests:
                  storage: {{ . | quote }}
      {{- end }}
      {{- with $prometheusVolumeSize }}
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          operator.prometheus.io/name: kube-prometheus-stack-prometheus
        action: patch
        options:
          data:
            spec:
              resources:
                requests:
                  storage: {{ . | quote }}
      {{- end }}
    {{- end }}
    {{- if or (not $alertmanagerVolumeSize) (not $prometheusVolumeSize) }}
    # After upgrading, delete the PVCs of each component that has no volume size configured
    postUpgrade:
      {{- if not $alertmanagerVolumeSize }}
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          alertmanager: kube-prometheus-stack-alertmanager
        action: delete
      {{- end }}
      {{- if not $prometheusVolumeSize }}
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          operator.prometheus.io/name: kube-prometheus-stack-prometheus
        action: delete
      {{- end }}
    {{- end }}
    # After deleting the release, remove the PVCs
    postDelete:
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          alertmanager: kube-prometheus-stack-alertmanager
        action: delete
      - apiVersion: v1
        kind: PersistentVolumeClaim
        selector:
          operator.prometheus.io/name: kube-prometheus-stack-prometheus
        action: delete
---
# Manifests resource that ships an additional Grafana dashboard as a ConfigMap labelled
# grafana_dashboard: "1" in the kube-prometheus-stack namespace.
apiVersion: addons.stackhpc.com/v1alpha1
kind: Manifests
metadata:
  name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }}
  labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }}
  annotations:
    # Tell Argo to ignore the non-controller owner references for this object
    argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true"
spec:
  clusterName: {{ include "cluster-addons.clusterName" . }}
  bootstrap: true
  # Quoted so that a number-like namespace is not re-typed by the YAML parser
  targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace | quote }}
  releaseName: kube-prometheus-stack-dashboards
  # The dashboard JSON is read from the chart files at render time and wrapped in
  # raw/endraw markers that are emitted literally.
  # NOTE(review): presumably the consumer of "template" renders it with a Jinja-style
  # engine and the markers keep the JSON's own braces untouched - confirm.
  manifestSources:
    - template: |
        apiVersion: v1
        kind: ConfigMap
        metadata:
          name: additional-grafana-dashboards
          labels:
            grafana_dashboard: "1"
        data:
          nvidia-dcgm-exporter-dashboard.json: |
            {% raw %}
            {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
            {% endraw %}
{{- end }}