From 244bd313bc1a47d750036d80cc23fe71abe10b59 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 19 Jan 2023 17:01:41 +0000
Subject: [PATCH 01/17] Add service monitor

---
 charts/cluster-addons/templates/ingress-nginx.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml
index 3960e17..ed07b5e 100644
--- a/charts/cluster-addons/templates/ingress-nginx.yaml
+++ b/charts/cluster-addons/templates/ingress-nginx.yaml
@@ -10,6 +10,15 @@ metadata:
 stringData:
   defaults: |
     controller:
+      # Indicates whether ingress controller metrics should be included in prometheus
+      metrics:
+        # Enable by default if cluster monitoring is enabled
+        enabled: {{ .Values.monitoring.enabled }}
+        serviceMonitor:
+          enabled: {{ .Values.monitoring.enabled }}
+          namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}  # monitoring-system
+          additionalLabels: 
+            release: kube-prometheus-stack 
       image:
         registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io
     admissionWebhooks:

From 04f9f7e4b14de36881dc2a4deb3dbaef275f2c53 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 19 Jan 2023 17:02:37 +0000
Subject: [PATCH 02/17] Add alertmanager config

---
 charts/cluster-addons/values.yaml | 34 +++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 64c6caf..587b3f8 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -163,7 +163,6 @@ ingress:
       version: 4.4.2
     release:
       namespace: ingress-nginx
-      values: {}
 
 # Settings for cluster monitoring
 monitoring:
@@ -176,7 +175,38 @@ monitoring:
       version: 43.3.1
     release:
       namespace: monitoring-system
-      values: {}
+      values:
+        # Disable alertmanager by default
+        alertmanager:
+          enabled: false
+
+        # Alertmanager does not come with pre-configured alert sinks so we have to
+        # write our own (and store it in a separate values to keep secrets hidden).
+        # 
+        # Example config to send alerts to a slack channel:
+        # 
+        # Note - 'null' receiver must be include as default kube-prometheus-stack 
+        #         alerting rules require it.
+        #         If it is not included then alertmanager pods will fail to launch
+        #         and errors will be printed in prometheus operator pod logs.
+        # 
+        # alertmanager:
+        #   enabled: true
+        #     config:
+        #     global:
+        #       slack_api_url: '<insert-secret-slack-webhook-url>'
+        #     route:
+        #       receiver: 'slack-notifications'
+        #       group_by: ['namespace']
+        #     receivers:
+        #     - name: 'null'
+        #     - name: 'slack-notifications'
+        #       slack_configs:
+        #       - channel: '#<insert-channel-name>'
+        #         send_resolved: true
+        #     - name: 'gmail-notifications'
+        #       TODO: Add example here
+
   lokiStack:
     enabled: true
     chart:

From b8a89a110bcd9ebb1acea74c3fad846ef8e9c4de Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 19 Jan 2023 17:03:08 +0000
Subject: [PATCH 03/17] Add persistent prometheus storage

---
 .../templates/monitoring/kube-prometheus-stack.yaml   | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index fb801c4..cf25c69 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -53,6 +53,17 @@ stringData:
       imageRenderer:
         image:
           repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana-image-renderer
+    prometheus:
+      prometheusSpec:
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              # Omit storageClassName field to use default storage class
+              # storageClassName: 
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 10Gi
   overrides: |
     {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }}
 ---

From f1e84c1756e974c309f24f28c73d6991f356e588 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 31 Jan 2023 11:24:12 +0000
Subject: [PATCH 04/17] Fix indentation

---
 charts/cluster-addons/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 587b3f8..fb8fc81 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -192,7 +192,7 @@ monitoring:
         # 
         # alertmanager:
         #   enabled: true
-        #     config:
+        #   config:
         #     global:
         #       slack_api_url: '<insert-secret-slack-webhook-url>'
         #     route:

From 72bb739463e71ffd8083bb2f93b527026d5fe977 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Wed, 1 Feb 2023 16:19:45 +0000
Subject: [PATCH 05/17] Fix linting

---
 charts/cluster-addons/templates/ingress-nginx.yaml |  2 +-
 charts/cluster-addons/values.yaml                  | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml
index ed07b5e..ef0e302 100644
--- a/charts/cluster-addons/templates/ingress-nginx.yaml
+++ b/charts/cluster-addons/templates/ingress-nginx.yaml
@@ -16,7 +16,7 @@ stringData:
         enabled: {{ .Values.monitoring.enabled }}
         serviceMonitor:
           enabled: {{ .Values.monitoring.enabled }}
-          namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}  # monitoring-system
+          namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
           additionalLabels: 
             release: kube-prometheus-stack 
       image:
diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index fb8fc81..043c856 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -181,15 +181,15 @@ monitoring:
           enabled: false
 
         # Alertmanager does not come with pre-configured alert sinks so we have to
-        # write our own (and store it in a separate values to keep secrets hidden).
-        # 
+        # write our own (and store it elsewhere to keep credential/secrets hidden).
+        #
         # Example config to send alerts to a slack channel:
-        # 
-        # Note - 'null' receiver must be include as default kube-prometheus-stack 
+        #
+        # Note - 'null' receiver must be include as default kube-prometheus-stack
         #         alerting rules require it.
         #         If it is not included then alertmanager pods will fail to launch
         #         and errors will be printed in prometheus operator pod logs.
-        # 
+        #
         # alertmanager:
         #   enabled: true
         #   config:

From c5d7ead6c0fc5eba67f824487c893515911d58d2 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Wed, 1 Feb 2023 16:49:59 +0000
Subject: [PATCH 06/17] Cleanup: move ServiceMonitor labels and Prometheus volume size into values

---
 charts/cluster-addons/templates/ingress-nginx.yaml     |  3 +--
 .../templates/monitoring/kube-prometheus-stack.yaml    |  2 +-
 charts/cluster-addons/values.yaml                      | 10 ++++++++++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml
index ef0e302..21cec28 100644
--- a/charts/cluster-addons/templates/ingress-nginx.yaml
+++ b/charts/cluster-addons/templates/ingress-nginx.yaml
@@ -17,8 +17,7 @@ stringData:
         serviceMonitor:
           enabled: {{ .Values.monitoring.enabled }}
           namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
-          additionalLabels: 
-            release: kube-prometheus-stack 
+          additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }}
       image:
         registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io
     admissionWebhooks:
diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index cf25c69..30ed439 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -63,7 +63,7 @@ stringData:
               accessModes: ["ReadWriteOnce"]
               resources:
                 requests:
-                  storage: 10Gi
+                  storage: {{- .Values.monitoring.prometheusVolumeCapacity }}
   overrides: |
     {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }}
 ---
diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 043c856..35a2741 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -168,6 +168,16 @@ ingress:
 monitoring:
   # Indicates if the cluster monitoring should be enabled
   enabled: false
+  # labels to be added to ServiceMonitor resources
+  # must match labels from .serviceMonitorSelector.matchLabels
+  # field of Prometheus resource created by kube-prometheus-stack
+  # in order for Prometheus to scrape metrics from the services
+  serviceMonitorLabels:
+    release: kube-prometheus-stack
+  # Size of the volume to provision on the target cloud for persistent storage of prometheus data
+  prometheusVolumeCapacity: 10Gi
+  # Config for the kube-prometheus-stack helm chart
+  # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack
   kubePrometheusStack:
     chart:
       repo: https://prometheus-community.github.io/helm-charts

From 63626b762d8f7e693d5325a38b6537544f2edf0b Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 2 Feb 2023 15:56:30 +0000
Subject: [PATCH 07/17] Fix template bug: do not trim whitespace before the storage value

---
 .../templates/monitoring/kube-prometheus-stack.yaml             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index 30ed439..5a80b0f 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -63,7 +63,7 @@ stringData:
               accessModes: ["ReadWriteOnce"]
               resources:
                 requests:
-                  storage: {{- .Values.monitoring.prometheusVolumeCapacity }}
+                  storage: {{ .Values.monitoring.prometheusVolumeCapacity }}
   overrides: |
     {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }}
 ---

From 2c58941dbbc3b76dc94ea4e3651fad4a9badaf8a Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 2 Feb 2023 16:26:53 +0000
Subject: [PATCH 08/17] Add NGINX ingress dashboard

---
 .../nginx-ingress-dashboard.json              | 1265 +++++++++++++++++
 .../monitoring/kube-prometheus-stack.yaml     |   22 +
 2 files changed, 1287 insertions(+)
 create mode 100644 charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json

diff --git a/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json
new file mode 100644
index 0000000..a87a323
--- /dev/null
+++ b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json
@@ -0,0 +1,1265 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "target": {
+          "limit": 100,
+          "matchAny": false,
+          "tags": [],
+          "type": "dashboard"
+        },
+        "type": "dashboard"
+      },
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "enable": true,
+        "expr": "sum(changes(nginx_ingress_controller_config_last_reload_successful_timestamp_seconds{instance!=\"unknown\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[30s])) by (controller_class)",
+        "hide": false,
+        "iconColor": "rgba(255, 96, 96, 1)",
+        "limit": 100,
+        "name": "Config Reloads",
+        "showIn": 0,
+        "step": "30s",
+        "tagKeys": "controller_class",
+        "tags": [],
+        "titleFormat": "Config Reloaded",
+        "type": "tags"
+      }
+    ]
+  },
+  "description": "Ingress-nginx supports a rich collection of prometheus metrics. If you have prometheus and grafana installed on your cluster then prometheus will already be scraping this data due to the scrape annotation on the deployment.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "gnetId": 9614,
+  "graphTooltip": 0,
+  "id": 29,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "ops"
+        },
+        "overrides": []
+      },
+      "id": 20,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "9.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "range": true,
+          "refId": "A",
+          "step": 4
+        }
+      ],
+      "title": "Controller Request Volume",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 6,
+        "x": 6,
+        "y": 0
+      },
+      "id": 82,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "9.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))",
+          "format": "time_series",
+          "instant": false,
+          "intervalFactor": 1,
+          "refId": "A",
+          "step": 4
+        }
+      ],
+      "title": "Controller Connections",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "rgba(245, 54, 54, 0.9)",
+                "value": null
+              },
+              {
+                "color": "rgba(237, 129, 40, 0.89)",
+                "value": 95
+              },
+              {
+                "color": "rgba(50, 172, 45, 0.97)",
+                "value": 99
+              }
+            ]
+          },
+          "unit": "percentunit"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 6,
+        "x": 12,
+        "y": 0
+      },
+      "id": 21,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "9.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"[4-5].*\"}[2m])) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "range": true,
+          "refId": "A",
+          "step": 4
+        }
+      ],
+      "title": "Controller Success Rate (non-4|5xx responses)",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "decimals": 0,
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 3,
+        "x": 18,
+        "y": 0
+      },
+      "id": 81,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "9.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "avg(nginx_ingress_controller_success{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"})",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A",
+          "step": 4
+        }
+      ],
+      "title": "Config Reloads",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "fixedColor": "rgb(31, 120, 193)",
+            "mode": "fixed"
+          },
+          "decimals": 0,
+          "mappings": [
+            {
+              "options": {
+                "match": "null",
+                "result": {
+                  "text": "N/A"
+                }
+              },
+              "type": "special"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 3,
+        "w": 3,
+        "x": 21,
+        "y": 0
+      },
+      "id": 83,
+      "links": [],
+      "maxDataPoints": 100,
+      "options": {
+        "colorMode": "none",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "9.3.1",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "count(nginx_ingress_controller_config_last_reload_successful{controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)",
+          "format": "time_series",
+          "instant": true,
+          "intervalFactor": 1,
+          "refId": "A",
+          "step": 4
+        }
+      ],
+      "title": "Last Config Failed",
+      "type": "stat"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "decimals": 2,
+      "editable": true,
+      "error": false,
+      "fill": 1,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 0,
+        "y": 3
+      },
+      "height": "200px",
+      "hiddenSeries": false,
+      "id": 86,
+      "isNew": true,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "hideEmpty": false,
+        "hideZero": true,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "sideWidth": 300,
+        "sort": "current",
+        "sortDesc": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "connected",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "9.3.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "repeatDirection": "h",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)",
+          "format": "time_series",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ ingress }}",
+          "metric": "network",
+          "refId": "A",
+          "step": 10
+        }
+      ],
+      "thresholds": [],
+      "timeRegions": [],
+      "title": "Ingress Request Volume",
+      "tooltip": {
+        "msResolution": false,
+        "shared": true,
+        "sort": 2,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "Bps",
+          "logBase": 1,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false
+      }
+    },
+    {
+      "aliasColors": {
+        "max - istio-proxy": "#890f02",
+        "max - master": "#bf1b00",
+        "max - prometheus": "#bf1b00"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "decimals": 2,
+      "editable": false,
+      "error": false,
+      "fill": 0,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 7,
+        "w": 12,
+        "x": 12,
+        "y": 3
+      },
+      "hiddenSeries": false,
+      "id": 87,
+      "isNew": true,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": false,
+        "hideEmpty": true,
+        "hideZero": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "sideWidth": 300,
+        "sort": "avg",
+        "sortDesc": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "connected",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "9.3.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\",status!~\"[4-5].*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)",
+          "format": "time_series",
+          "instant": false,
+          "interval": "10s",
+          "intervalFactor": 1,
+          "legendFormat": "{{ ingress }}",
+          "metric": "container_memory_usage:sort_desc",
+          "refId": "A",
+          "step": 10
+        }
+      ],
+      "thresholds": [],
+      "timeRegions": [],
+      "title": "Ingress Success Rate (non-4|5xx responses)",
+      "tooltip": {
+        "msResolution": false,
+        "shared": true,
+        "sort": 1,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "percentunit",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "short",
+          "logBase": 1,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "decimals": 2,
+      "editable": true,
+      "error": false,
+      "fill": 1,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 0,
+        "y": 10
+      },
+      "height": "200px",
+      "hiddenSeries": false,
+      "id": 32,
+      "isNew": true,
+      "legend": {
+        "alignAsTable": false,
+        "avg": true,
+        "current": true,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": false,
+        "sideWidth": 200,
+        "sort": "current",
+        "sortDesc": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "connected",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "9.3.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "editorMode": "code",
+          "expr": "sum (irate (nginx_ingress_controller_nginx_process_read_bytes_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))",
+          "format": "time_series",
+          "instant": false,
+          "interval": "10s",
+          "intervalFactor": 1,
+          "legendFormat": "Received",
+          "metric": "network",
+          "refId": "A",
+          "step": 10
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "- sum (irate (nginx_ingress_controller_response_size_sum{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))",
+          "format": "time_series",
+          "hide": false,
+          "interval": "10s",
+          "intervalFactor": 1,
+          "legendFormat": "Sent",
+          "metric": "network",
+          "refId": "B",
+          "step": 10
+        }
+      ],
+      "thresholds": [],
+      "timeRegions": [],
+      "title": "Network I/O pressure",
+      "tooltip": {
+        "msResolution": false,
+        "shared": true,
+        "sort": 0,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "Bps",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "Bps",
+          "logBase": 1,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false
+      }
+    },
+    {
+      "aliasColors": {
+        "max - istio-proxy": "#890f02",
+        "max - master": "#bf1b00",
+        "max - prometheus": "#bf1b00"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "decimals": 2,
+      "editable": false,
+      "error": false,
+      "fill": 0,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 8,
+        "y": 10
+      },
+      "hiddenSeries": false,
+      "id": 77,
+      "isNew": true,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": false,
+        "sideWidth": 200,
+        "sort": "current",
+        "sortDesc": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "connected",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "9.3.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}) ",
+          "format": "time_series",
+          "instant": false,
+          "interval": "10s",
+          "intervalFactor": 1,
+          "legendFormat": "nginx",
+          "metric": "container_memory_usage:sort_desc",
+          "refId": "A",
+          "step": 10
+        }
+      ],
+      "thresholds": [],
+      "timeRegions": [],
+      "title": "Average Memory Usage",
+      "tooltip": {
+        "msResolution": false,
+        "shared": true,
+        "sort": 2,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "bytes",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "short",
+          "logBase": 1,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false
+      }
+    },
+    {
+      "aliasColors": {
+        "max - istio-proxy": "#890f02",
+        "max - master": "#bf1b00"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "decimals": 3,
+      "editable": false,
+      "error": false,
+      "fill": 0,
+      "fillGradient": 0,
+      "grid": {},
+      "gridPos": {
+        "h": 6,
+        "w": 8,
+        "x": 16,
+        "y": 10
+      },
+      "height": "",
+      "hiddenSeries": false,
+      "id": 79,
+      "isNew": true,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": false,
+        "min": false,
+        "rightSide": false,
+        "show": false,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "links": [],
+      "nullPointMode": "connected",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "9.3.1",
+      "pointradius": 5,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "sum (rate (nginx_ingress_controller_nginx_process_cpu_seconds_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m])) ",
+          "format": "time_series",
+          "interval": "10s",
+          "intervalFactor": 1,
+          "legendFormat": "nginx",
+          "metric": "container_cpu",
+          "refId": "A",
+          "step": 10
+        }
+      ],
+      "thresholds": [
+        {
+          "colorMode": "critical",
+          "fill": true,
+          "line": true,
+          "op": "gt"
+        }
+      ],
+      "timeRegions": [],
+      "title": "Average CPU Usage",
+      "tooltip": {
+        "msResolution": true,
+        "shared": true,
+        "sort": 2,
+        "value_type": "cumulative"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "none",
+          "label": "cores",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "short",
+          "logBase": 1,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false
+      }
+    },
+    {
+      "columns": [
+        {
+          "$$hashKey": "object:336",
+          "text": "Current",
+          "value": "current"
+        }
+      ],
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fontSize": "100%",
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 16
+      },
+      "height": "1024",
+      "id": 85,
+      "links": [],
+      "pageSize": 7,
+      "scroll": true,
+      "showHeader": true,
+      "sort": {
+        "col": 1,
+        "desc": false
+      },
+      "styles": [
+        {
+          "alias": "Time",
+          "align": "auto",
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "pattern": "Time",
+          "type": "date"
+        },
+        {
+          "alias": "TTL",
+          "align": "auto",
+          "colorMode": "cell",
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "dateFormat": "YYYY-MM-DD HH:mm:ss",
+          "decimals": 0,
+          "pattern": "Current",
+          "thresholds": [
+            "0",
+            "691200"
+          ],
+          "type": "number",
+          "unit": "s"
+        },
+        {
+          "alias": "",
+          "align": "auto",
+          "colors": [
+            "rgba(245, 54, 54, 0.9)",
+            "rgba(237, 129, 40, 0.89)",
+            "rgba(50, 172, 45, 0.97)"
+          ],
+          "decimals": 2,
+          "pattern": "/.*/",
+          "thresholds": [],
+          "type": "number",
+          "unit": "short"
+        }
+      ],
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "avg(nginx_ingress_controller_ssl_expire_time_seconds{kubernetes_pod_name=~\"$controller\",namespace=~\"$namespace\",ingress=~\"$ingress\"}) by (host) - time()",
+          "format": "time_series",
+          "intervalFactor": 1,
+          "legendFormat": "{{ host }}",
+          "metric": "gke_letsencrypt_cert_expiration",
+          "refId": "A",
+          "step": 1
+        }
+      ],
+      "title": "Ingress Certificate Expiry",
+      "transform": "timeseries_aggregations",
+      "type": "table-old"
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 37,
+  "style": "dark",
+  "tags": [
+    "nginx"
+  ],
+  "templating": {
+    "list": [
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "definition": "",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Namespace",
+        "multi": false,
+        "name": "namespace",
+        "options": [],
+        "query": {
+          "query": "label_values(nginx_ingress_controller_config_hash, controller_namespace)",
+          "refId": "Prometheus-namespace-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "definition": "",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Controller Class",
+        "multi": false,
+        "name": "controller_class",
+        "options": [],
+        "query": {
+          "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\"}, controller_class) ",
+          "refId": "Prometheus-controller_class-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "definition": "",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Controller",
+        "multi": false,
+        "name": "controller",
+        "options": [],
+        "query": {
+          "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\",controller_class=~\"$controller_class\"}, controller_pod) ",
+          "refId": "Prometheus-controller-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "definition": "",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Ingress",
+        "multi": false,
+        "name": "ingress",
+        "options": [],
+        "query": {
+          "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\",controller_class=~\"$controller_class\",controller=~\"$controller\"}, ingress) ",
+          "refId": "Prometheus-ingress-Variable-Query"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 2,
+        "tagValuesQuery": "",
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "2m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "NGINX Ingress controller",
+  "uid": "nginx",
+  "version": 1,
+  "weekStart": ""
+}
diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index 5a80b0f..6727073 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -113,4 +113,26 @@ spec:
         data:
           nvidia-dcgm-exporter-dashboard.json: |
             {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }}
+---
+apiVersion: addons.stackhpc.com/v1alpha1
+kind: Manifests
+metadata:
+  name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }}
+  labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }}
+spec:
+  clusterName: {{ include "cluster-addons.clusterName" . }}
+  bootstrap: true
+  targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
+  releaseName: kube-prometheus-stack-dashboards
+  manifestSources:
+    - template: |
+        apiVersion: v1
+        kind: ConfigMap
+        metadata:
+          name: nginx-ingress-dashboard
+          labels:
+            grafana_dashboard: "1"
+        data:
+          nginx-ingress-dashboard.json: |
+            {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
 {{- end }}

From cbbf109041cfc265f387a19d622d6a6c51bba482 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Thu, 2 Feb 2023 16:58:05 +0000
Subject: [PATCH 09/17] Fix dashboard config

---
 ... nvidia-dcgm-exporter-dashboard_rev3.json} | 559 +++++++++---------
 .../monitoring/kube-prometheus-stack.yaml     |  24 +-
 2 files changed, 281 insertions(+), 302 deletions(-)
 rename charts/cluster-addons/grafana-dashboards/{nvidia-dcgm-exporter-dashboard_rev2.json => nvidia-dcgm-exporter-dashboard_rev3.json} (72%)

diff --git a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json
similarity index 72%
rename from charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json
rename to charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json
index 40f89d8..a811c8e 100644
--- a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json
+++ b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json
@@ -1,68 +1,51 @@
 {
-  "__inputs": [
-    {
-      "name": "DS_PROMETHEUS",
-      "label": "Prometheus",
-      "description": "",
-      "type": "datasource",
-      "pluginId": "prometheus",
-      "pluginName": "Prometheus"
-    }
-  ],
-  "__requires": [
-    {
-      "type": "panel",
-      "id": "gauge",
-      "name": "Gauge",
-      "version": ""
-    },
-    {
-      "type": "grafana",
-      "id": "grafana",
-      "name": "Grafana",
-      "version": "6.7.3"
-    },
-    {
-      "type": "panel",
-      "id": "graph",
-      "name": "Graph",
-      "version": ""
-    },
-    {
-      "type": "datasource",
-      "id": "prometheus",
-      "name": "Prometheus",
-      "version": "1.0.0"
-    }
-  ],
   "annotations": {
     "list": [
       {
         "$$hashKey": "object:192",
         "builtIn": 1,
-        "datasource": "-- Grafana --",
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
         "enable": true,
         "hide": true,
         "iconColor": "rgba(0, 211, 255, 1)",
         "name": "Annotations & Alerts",
+        "target": {
+          "limit": 100,
+          "matchAny": false,
+          "tags": [],
+          "type": "dashboard"
+        },
         "type": "dashboard"
       }
     ]
   },
   "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
   "editable": true,
+  "fiscalYearStartMonth": 0,
   "gnetId": 12239,
   "graphTooltip": 0,
-  "id": null,
-  "iteration": 1588401887165,
+  "id": 33,
   "links": [],
+  "liveNow": false,
   "panels": [
     {
       "aliasColors": {},
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "links": []
+        },
+        "overrides": []
+      },
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -88,9 +71,10 @@
       "linewidth": 2,
       "nullPointMode": "null",
       "options": {
-        "dataLinks": []
+        "alertThreshold": true
       },
       "percentage": false,
+      "pluginVersion": "9.3.1",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -100,6 +84,10 @@
       "steppedLine": false,
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
           "instant": false,
           "interval": "",
@@ -108,9 +96,7 @@
         }
       ],
       "thresholds": [],
-      "timeFrom": null,
       "timeRegions": [],
-      "timeShift": null,
       "title": "GPU Temperature",
       "tooltip": {
         "shared": true,
@@ -119,37 +105,60 @@
       },
       "type": "graph",
       "xaxis": {
-        "buckets": null,
         "mode": "time",
-        "name": null,
         "show": true,
         "values": []
       },
       "yaxes": [
         {
           "format": "celsius",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         },
         {
           "format": "short",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         }
       ],
       "yaxis": {
-        "align": false,
-        "alignLevel": null
+        "align": false
       }
     },
     {
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "max": 100,
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "#EAB839",
+                "value": 83
+              },
+              {
+                "color": "red",
+                "value": 87
+              }
+            ]
+          },
+          "unit": "celsius"
+        },
+        "overrides": []
+      },
       "gridPos": {
         "h": 8,
         "w": 6,
@@ -158,54 +167,30 @@
       },
       "id": 14,
       "options": {
-        "fieldOptions": {
+        "orientation": "auto",
+        "reduceOptions": {
           "calcs": [
             "mean"
           ],
-          "defaults": {
-            "color": {
-              "mode": "thresholds"
-            },
-            "mappings": [],
-            "max": 100,
-            "min": 0,
-            "thresholds": {
-              "mode": "absolute",
-              "steps": [
-                {
-                  "color": "green",
-                  "value": null
-                },
-                {
-                  "color": "#EAB839",
-                  "value": 83
-                },
-                {
-                  "color": "red",
-                  "value": 87
-                }
-              ]
-            },
-            "unit": "celsius"
-          },
-          "overrides": [],
+          "fields": "",
           "values": false
         },
-        "orientation": "auto",
         "showThresholdLabels": false,
         "showThresholdMarkers": true
       },
-      "pluginVersion": "6.7.3",
+      "pluginVersion": "9.3.1",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
           "interval": "",
           "legendFormat": "",
           "refId": "A"
         }
       ],
-      "timeFrom": null,
-      "timeShift": null,
       "title": "GPU Avg. Temp",
       "type": "gauge"
     },
@@ -214,7 +199,16 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "links": []
+        },
+        "overrides": []
+      },
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -240,10 +234,10 @@
       "linewidth": 2,
       "nullPointMode": "null",
       "options": {
-        "dataLinks": []
+        "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "6.5.2",
+      "pluginVersion": "9.3.1",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -253,6 +247,10 @@
       "steppedLine": false,
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
           "interval": "",
           "legendFormat": "GPU {{gpu}}",
@@ -260,9 +258,7 @@
         }
       ],
       "thresholds": [],
-      "timeFrom": null,
       "timeRegions": [],
-      "timeShift": null,
       "title": "GPU Power Usage",
       "tooltip": {
         "shared": true,
@@ -271,38 +267,60 @@
       },
       "type": "graph",
       "xaxis": {
-        "buckets": null,
         "mode": "time",
-        "name": null,
         "show": true,
         "values": []
       },
       "yaxes": [
         {
           "format": "watt",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         },
         {
           "format": "short",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         }
       ],
       "yaxis": {
-        "align": false,
-        "alignLevel": null
+        "align": false
       }
     },
     {
-      "cacheTimeout": null,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "max": 2400,
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "#EAB839",
+                "value": 1800
+              },
+              {
+                "color": "red",
+                "value": 2200
+              }
+            ]
+          },
+          "unit": "watt"
+        },
+        "overrides": []
+      },
       "gridPos": {
         "h": 8,
         "w": 6,
@@ -312,55 +330,30 @@
       "id": 16,
       "links": [],
       "options": {
-        "fieldOptions": {
+        "orientation": "horizontal",
+        "reduceOptions": {
           "calcs": [
             "sum"
           ],
-          "defaults": {
-            "color": {
-              "mode": "thresholds"
-            },
-            "mappings": [],
-            "max": 2400,
-            "min": 0,
-            "nullValueMode": "connected",
-            "thresholds": {
-              "mode": "absolute",
-              "steps": [
-                {
-                  "color": "green",
-                  "value": null
-                },
-                {
-                  "color": "#EAB839",
-                  "value": 1800
-                },
-                {
-                  "color": "red",
-                  "value": 2200
-                }
-              ]
-            },
-            "unit": "watt"
-          },
-          "overrides": [],
+          "fields": "",
           "values": false
         },
-        "orientation": "horizontal",
         "showThresholdLabels": false,
         "showThresholdMarkers": true
       },
-      "pluginVersion": "6.7.3",
+      "pluginVersion": "9.3.1",
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
           "interval": "",
           "legendFormat": "",
           "refId": "A"
         }
       ],
-      "timeFrom": null,
-      "timeShift": null,
       "title": "GPU Power Total",
       "type": "gauge"
     },
@@ -369,7 +362,16 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "links": []
+        },
+        "overrides": []
+      },
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -389,7 +391,6 @@
         "min": false,
         "rightSide": true,
         "show": true,
-        "sideWidth": null,
         "total": false,
         "values": true
       },
@@ -397,9 +398,10 @@
       "linewidth": 2,
       "nullPointMode": "null",
       "options": {
-        "dataLinks": []
+        "alertThreshold": true
       },
       "percentage": false,
+      "pluginVersion": "9.3.1",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -409,6 +411,10 @@
       "steppedLine": false,
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000",
           "format": "time_series",
           "interval": "",
@@ -418,9 +424,7 @@
         }
       ],
       "thresholds": [],
-      "timeFrom": null,
       "timeRegions": [],
-      "timeShift": null,
       "title": "GPU SM Clocks",
       "tooltip": {
         "shared": true,
@@ -429,34 +433,25 @@
       },
       "type": "graph",
       "xaxis": {
-        "buckets": null,
         "mode": "time",
-        "name": null,
         "show": true,
         "values": []
       },
       "yaxes": [
         {
-          "decimals": null,
           "format": "hertz",
           "label": "",
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         },
         {
           "format": "short",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         }
       ],
       "yaxis": {
-        "align": false,
-        "alignLevel": null
+        "align": false
       }
     },
     {
@@ -464,7 +459,10 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -502,6 +500,10 @@
       "steppedLine": false,
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
           "interval": "",
           "legendFormat": "GPU {{gpu}}",
@@ -509,9 +511,7 @@
         }
       ],
       "thresholds": [],
-      "timeFrom": null,
       "timeRegions": [],
-      "timeShift": null,
       "title": "GPU Utilization",
       "tooltip": {
         "shared": true,
@@ -520,16 +520,13 @@
       },
       "type": "graph",
       "xaxis": {
-        "buckets": null,
         "mode": "time",
-        "name": null,
         "show": true,
         "values": []
       },
       "yaxes": [
         {
           "format": "percent",
-          "label": null,
           "logBase": 1,
           "max": "100",
           "min": "0",
@@ -537,16 +534,12 @@
         },
         {
           "format": "short",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         }
       ],
       "yaxis": {
-        "align": false,
-        "alignLevel": null
+        "align": false
       }
     },
     {
@@ -554,7 +547,10 @@
       "bars": false,
       "dashLength": 10,
       "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
       "fill": 1,
       "fillGradient": 0,
       "gridPos": {
@@ -564,95 +560,6 @@
         "y": 32
       },
       "hiddenSeries": false,
-      "id": 18,
-      "legend": {
-        "avg": true,
-        "current": false,
-        "max": true,
-        "min": false,
-        "rightSide": true,
-        "show": true,
-        "total": false,
-        "values": true
-      },
-      "lines": true,
-      "linewidth": 2,
-      "nullPointMode": "null",
-      "options": {
-        "dataLinks": []
-      },
-      "percentage": false,
-      "pointradius": 2,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": false,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
-          "interval": "",
-          "legendFormat": "GPU {{gpu}}",
-          "refId": "A"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "GPU Framebuffer Mem Used",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "decmbytes",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
-    {
-      "aliasColors": {},
-      "bars": false,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "${DS_PROMETHEUS}",
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 24
-      },
-      "hiddenSeries": false,
       "id": 4,
       "legend": {
         "alignAsTable": true,
@@ -681,6 +588,10 @@
       "steppedLine": false,
       "targets": [
         {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
           "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
           "interval": "",
           "legendFormat": "GPU {{gpu}}",
@@ -688,9 +599,7 @@
         }
       ],
       "thresholds": [],
-      "timeFrom": null,
       "timeRegions": [],
-      "timeShift": null,
       "title": "Tensor Core Utilization",
       "tooltip": {
         "shared": true,
@@ -699,16 +608,13 @@
       },
       "type": "graph",
       "xaxis": {
-        "buckets": null,
         "mode": "time",
-        "name": null,
         "show": true,
         "values": []
       },
       "yaxes": [
         {
           "format": "percentunit",
-          "label": null,
           "logBase": 1,
           "max": "1",
           "min": "0",
@@ -716,66 +622,161 @@
         },
         {
           "format": "short",
-          "label": null,
           "logBase": 1,
-          "max": null,
-          "min": null,
           "show": true
         }
       ],
       "yaxis": {
-        "align": false,
-        "alignLevel": null
+        "align": false
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus"
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 40
+      },
+      "hiddenSeries": false,
+      "id": 18,
+      "legend": {
+        "avg": true,
+        "current": false,
+        "max": true,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 2,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
+          "interval": "",
+          "legendFormat": "GPU {{gpu}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeRegions": [],
+      "title": "GPU Framebuffer Mem Used",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "mode": "time",
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "decmbytes",
+          "logBase": 1,
+          "show": true
+        },
+        {
+          "format": "short",
+          "logBase": 1,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false
       }
     }
   ],
   "refresh": false,
-  "schemaVersion": 22,
+  "schemaVersion": 37,
   "style": "dark",
   "tags": [],
   "templating": {
     "list": [
       {
-        "allValue": null,
-        "current": {},
-        "datasource": "${DS_PROMETHEUS}",
+        "current": {
+          "isNone": true,
+          "selected": false,
+          "text": "None",
+          "value": ""
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
         "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
         "hide": 0,
         "includeAll": false,
-        "label": null,
         "multi": true,
         "name": "instance",
         "options": [],
-        "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
+        "query": {
+          "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
+          "refId": "Prometheus-instance-Variable-Query"
+        },
         "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
         "sort": 0,
         "tagValuesQuery": "",
-        "tags": [],
         "tagsQuery": "",
         "type": "query",
         "useTags": false
       },
       {
-        "allValue": null,
-        "current": {},
-        "datasource": "${DS_PROMETHEUS}",
+        "current": {
+          "selected": false,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
         "definition": "label_values(gpu)",
         "hide": 0,
         "includeAll": true,
-        "index": -1,
-        "label": null,
         "multi": true,
         "name": "gpu",
         "options": [],
-        "query": "label_values(gpu)",
+        "query": {
+          "query": "label_values(gpu)",
+          "refId": "Prometheus-gpu-Variable-Query"
+        },
         "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
         "sort": 1,
         "tagValuesQuery": "",
-        "tags": [],
         "tagsQuery": "",
         "type": "query",
         "useTags": false
@@ -803,8 +804,6 @@
   "timezone": "",
   "title": "NVIDIA DCGM Exporter Dashboard",
   "uid": "Oxed_c6Wz",
-  "variables": {
-    "list": []
-  },
-  "version": 1
+  "version": 1,
+  "weekStart": ""
 }
diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index 6727073..e166fc0 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -107,32 +107,12 @@ spec:
         apiVersion: v1
         kind: ConfigMap
         metadata:
-          name: nvidia-dcgm-exporter-dashboard
+          name: additional-grafana-dashboard
           labels:
             grafana_dashboard: "1"
         data:
           nvidia-dcgm-exporter-dashboard.json: |
-            {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }}
----
-apiVersion: addons.stackhpc.com/v1alpha1
-kind: Manifests
-metadata:
-  name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }}
-  labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }}
-spec:
-  clusterName: {{ include "cluster-addons.clusterName" . }}
-  bootstrap: true
-  targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
-  releaseName: kube-prometheus-stack-dashboards
-  manifestSources:
-    - template: |
-        apiVersion: v1
-        kind: ConfigMap
-        metadata:
-          name: nginx-ingress-dashboard
-          labels:
-            grafana_dashboard: "1"
-        data:
+            {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
           nginx-ingress-dashboard.json: |
             {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
 {{- end }}

From e6c10582e28629ceb43c919daff3fad02ed4123f Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Fri, 3 Feb 2023 15:32:34 +0000
Subject: [PATCH 10/17] Only include nginx dashboard when addon is enabled

---
 .../templates/monitoring/kube-prometheus-stack.yaml             | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index e166fc0..11771ef 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -113,6 +113,8 @@ spec:
         data:
           nvidia-dcgm-exporter-dashboard.json: |
             {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
+          {{- if .Values.ingress.enabled }}
           nginx-ingress-dashboard.json: |
             {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
+          {{- end }}
 {{- end }}

From 8982679e88849bba6741b020d5683d7d1f371d47 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 7 Feb 2023 12:03:13 +0000
Subject: [PATCH 11/17] Add metrics storage limit to prometheus

---
 .../templates/monitoring/kube-prometheus-stack.yaml         | 1 +
 charts/cluster-addons/values.yaml                           | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index 11771ef..496ecaf 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -55,6 +55,7 @@ stringData:
           repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana-image-renderer
     prometheus:
       prometheusSpec:
+        retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }}
         storageSpec:
           volumeClaimTemplate:
             spec:
diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 35a2741..456cfce 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -176,6 +176,8 @@ monitoring:
     release: kube-prometheus-stack
   # Size of the volume to provision on the target cloud for persistent storage of prometheus data
   prometheusVolumeCapacity: 10Gi
+  # Should be less than prometheusVolumeCapacity
+  prometheusMetricsRetentionSize: 9.9GB
   # Config for the kube-prometheus-stack helm chart
   # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack
   kubePrometheusStack:
@@ -190,13 +192,14 @@ monitoring:
         alertmanager:
           enabled: false
 
+        #############################################################################
         # Alertmanager does not come with pre-configured alert sinks so we have to
         # write our own (and store it elsewhere to keep credential/secrets hidden).
         #
         # Example config to send alerts to a slack channel:
         #
         # Note - 'null' receiver must be include as default kube-prometheus-stack
-        #         alerting rules require it.
+        #         alerting rules (specifically the Watchdog alert) require it.
         #         If it is not included then alertmanager pods will fail to launch
         #         and errors will be printed in prometheus operator pod logs.
         #
@@ -216,6 +219,7 @@ monitoring:
         #         send_resolved: true
         #     - name: 'gmail-notifications'
         #       TODO: Add example here
+        #############################################################################
 
   lokiStack:
     enabled: true

From 4bc41a74bfb293c1e1f5d2899affcdbb50dc3c5d Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 7 Feb 2023 13:13:35 +0000
Subject: [PATCH 12/17] Remove duplicate values

---
 .../monitoring/kube-prometheus-stack.yaml     | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index 496ecaf..c945370 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -29,6 +29,14 @@ stringData:
       prometheusSpec:
         image:
           registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
+        retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }}
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: {{ .Values.monitoring.prometheusVolumeCapacity }}
     thanosRuler:
       thanosRulerSpec:
         image:
@@ -53,18 +61,6 @@ stringData:
       imageRenderer:
         image:
           repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana-image-renderer
-    prometheus:
-      prometheusSpec:
-        retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }}
-        storageSpec:
-          volumeClaimTemplate:
-            spec:
-              # Omit storageClassName field to use default storage class
-              # storageClassName: 
-              accessModes: ["ReadWriteOnce"]
-              resources:
-                requests:
-                  storage: {{ .Values.monitoring.prometheusVolumeCapacity }}
   overrides: |
     {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }}
 ---

From 87bc0ea8a31eadbf6a9b344a517b99f59efa3932 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 7 Feb 2023 13:16:39 +0000
Subject: [PATCH 13/17] Defer alertmanager.enabled default to KPS

---
 charts/cluster-addons/values.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 456cfce..3d08080 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -187,10 +187,7 @@ monitoring:
       version: 43.3.1
     release:
       namespace: monitoring-system
-      values:
-        # Disable alertmanager by default
-        alertmanager:
-          enabled: false
+      values: {}
 
         #############################################################################
         # Alertmanager does not come with pre-configured alert sinks so we have to

From 049f7b318c3b8742d1e4fbf6c978cdc656fa144f Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 7 Feb 2023 13:40:36 +0000
Subject: [PATCH 14/17] Ignore .vscode

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index e48b0fc..7042f36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 charts/*/charts
-Chart.lock
\ No newline at end of file
+Chart.lock
+.vscode
\ No newline at end of file

From d4f8af3456cb888a55bfb76d33fa0825171e9a0c Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Fri, 10 Feb 2023 14:43:21 +0000
Subject: [PATCH 15/17] Calculate retention size from volume size

---
 .../templates/monitoring/kube-prometheus-stack.yaml         | 4 ++--
 charts/cluster-addons/values.yaml                           | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
index c945370..c51de01 100644
--- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml
@@ -29,14 +29,14 @@ stringData:
       prometheusSpec:
         image:
           registry: {{ include "cluster-addons.imagePrefix" . }}quay.io
-        retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }}
+        retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB
         storageSpec:
           volumeClaimTemplate:
             spec:
               accessModes: ["ReadWriteOnce"]
               resources:
                 requests:
-                  storage: {{ .Values.monitoring.prometheusVolumeCapacity }}
+                  storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi
     thanosRuler:
       thanosRulerSpec:
         image:
diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 3d08080..714666c 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -174,10 +174,8 @@ monitoring:
   # in order for Prometheus to scrape metrics from the services
   serviceMonitorLabels:
     release: kube-prometheus-stack
-  # Size of the volume to provision on the target cloud for persistent storage of prometheus data
-  prometheusVolumeCapacity: 10Gi
-  # Should be less than prometheusVolumeCapacity
-  prometheusMetricsRetentionSize: 9.9GB
+  # Size of the volume in GiB to provision on the target cloud for persistent storage of prometheus data
+  prometheusVolumeCapacity: 10
   # Config for the kube-prometheus-stack helm chart
   # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack
   kubePrometheusStack:

From de01135e01d66d158a5bf78d882324fd51b86ed0 Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Mon, 20 Feb 2023 09:39:47 +0000
Subject: [PATCH 16/17] Update VM flavor names

---
 charts/openstack-cluster/ci/kube-1-23-ha-values.yaml | 4 ++--
 charts/openstack-cluster/ci/kube-1-23-values.yaml    | 4 ++--
 charts/openstack-cluster/ci/kube-1-24-ha-values.yaml | 4 ++--
 charts/openstack-cluster/ci/kube-1-24-values.yaml    | 4 ++--
 charts/openstack-cluster/ci/kube-1-25-ha-values.yaml | 4 ++--
 charts/openstack-cluster/ci/kube-1-25-values.yaml    | 4 ++--
 charts/openstack-cluster/ci/kube-1-26-ha-values.yaml | 4 ++--
 charts/openstack-cluster/ci/kube-1-26-values.yaml    | 4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml
index cf5d10b..dcb4b5f 100644
--- a/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml
@@ -4,10 +4,10 @@ kubernetesVersion: 1.23.15
 machineImageId: c2f235c1-ad10-4e96-8568-aac864945686
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-23-values.yaml b/charts/openstack-cluster/ci/kube-1-23-values.yaml
index fe2a154..3a232b5 100644
--- a/charts/openstack-cluster/ci/kube-1-23-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-23-values.yaml
@@ -7,10 +7,10 @@ apiServer:
   enableLoadBalancer: false
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml
index f56653d..7b8eb0f 100644
--- a/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml
@@ -4,10 +4,10 @@ kubernetesVersion: 1.24.9
 machineImageId: ad1405d6-5270-4d5b-b403-a6cba3762f8e
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-24-values.yaml b/charts/openstack-cluster/ci/kube-1-24-values.yaml
index 2c53ef0..8674552 100644
--- a/charts/openstack-cluster/ci/kube-1-24-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-24-values.yaml
@@ -7,10 +7,10 @@ apiServer:
   enableLoadBalancer: false
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml
index dcb5468..6bc7a43 100644
--- a/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml
@@ -4,10 +4,10 @@ kubernetesVersion: 1.25.4
 machineImageId: 48c078a5-fd89-4f61-9d6a-c4f48745c0ae
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-25-values.yaml b/charts/openstack-cluster/ci/kube-1-25-values.yaml
index cad07da..3c3dff9 100644
--- a/charts/openstack-cluster/ci/kube-1-25-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-25-values.yaml
@@ -7,10 +7,10 @@ apiServer:
   enableLoadBalancer: false
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml
index d4853f0..27b19b8 100644
--- a/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml
@@ -4,10 +4,10 @@ kubernetesVersion: 1.26.0
 machineImageId: 5eae91aa-0c96-472e-ba8c-edd6162281f7
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1
diff --git a/charts/openstack-cluster/ci/kube-1-26-values.yaml b/charts/openstack-cluster/ci/kube-1-26-values.yaml
index 6b54f8b..a94759c 100644
--- a/charts/openstack-cluster/ci/kube-1-26-values.yaml
+++ b/charts/openstack-cluster/ci/kube-1-26-values.yaml
@@ -7,10 +7,10 @@ apiServer:
   enableLoadBalancer: false
 
 controlPlane:
-  machineFlavor: vm.alaska.cpu.general.small
+  machineFlavor: vm.ska.cpu.general.small
   machineCount: 1
 
 nodeGroups:
   - machineCount: 2
-    machineFlavor: vm.alaska.cpu.general.small
+    machineFlavor: vm.ska.cpu.general.small
     name: test-group1

From 81bc1c430a1c79ccabae2a47a898e72f824e9c7d Mon Sep 17 00:00:00 2001
From: Scott Davidson <scott@stackhpc.com>
Date: Tue, 21 Feb 2023 10:45:43 +0000
Subject: [PATCH 17/17] Add loki data persistence

---
 charts/cluster-addons/templates/monitoring/loki-stack.yaml | 3 +++
 charts/cluster-addons/values.yaml                          | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml
index 2ae29c2..a28f14d 100644
--- a/charts/cluster-addons/templates/monitoring/loki-stack.yaml
+++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml
@@ -12,6 +12,9 @@ stringData:
     loki:
       image:
         repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki
+      persistence:
+        enabled: true
+        size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi
     promtail:
       image:
         registry: {{ include "cluster-addons.imagePrefix" . }}docker.io
diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml
index 714666c..bf19acf 100644
--- a/charts/cluster-addons/values.yaml
+++ b/charts/cluster-addons/values.yaml
@@ -225,6 +225,8 @@ monitoring:
     release:
       namespace: monitoring-system
       values: {}
+  # Size of the volume in GiB to provision on the target cloud for persistent storage of loki data
+  lokiVolumeCapacity: 10
 
 # Settings for node feature discovery
 # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery