From 244bd313bc1a47d750036d80cc23fe71abe10b59 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 19 Jan 2023 17:01:41 +0000 Subject: [PATCH 01/17] Add service monitor --- charts/cluster-addons/templates/ingress-nginx.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml index 3960e17..ed07b5e 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -10,6 +10,15 @@ metadata: stringData: defaults: | controller: + # Indicates whether ingress controller metrics should be included in prometheus + metrics: + # Enable by default if cluster monitoring is enabled + enabled: {{ .Values.monitoring.enabled }} + serviceMonitor: + enabled: {{ .Values.monitoring.enabled }} + namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} # monitoring-system + additionalLabels: + release: kube-prometheus-stack image: registry: {{ include "cluster-addons.imagePrefix" . 
}}registry.k8s.io admissionWebhooks: From 04f9f7e4b14de36881dc2a4deb3dbaef275f2c53 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 19 Jan 2023 17:02:37 +0000 Subject: [PATCH 02/17] Add alertmanager config --- charts/cluster-addons/values.yaml | 34 +++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 64c6caf..587b3f8 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -163,7 +163,6 @@ ingress: version: 4.4.2 release: namespace: ingress-nginx - values: {} # Settings for cluster monitoring monitoring: @@ -176,7 +175,38 @@ monitoring: version: 43.3.1 release: namespace: monitoring-system - values: {} + values: + # Disable alertmanager by default + alertmanager: + enabled: false + + # Alertmanager does not come with pre-configured alert sinks so we have to + # write our own (and store it in a separate values to keep secrets hidden). + # + # Example config to send alerts to a slack channel: + # + # Note - 'null' receiver must be include as default kube-prometheus-stack + # alerting rules require it. + # If it is not included then alertmanager pods will fail to launch + # and errors will be printed in prometheus operator pod logs. 
+ # # alertmanager: # enabled: true # config: # global: # slack_api_url: '' # route: # receiver: 'slack-notifications' # group_by: ['namespace'] # receivers: # - name: 'null' # - name: 'slack-notifications' # slack_configs: # - channel: '#' # send_resolved: true # - name: 'gmail-notifications' # TODO: Add example here + lokiStack: enabled: true chart: From b8a89a110bcd9ebb1acea74c3fad846ef8e9c4de Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 19 Jan 2023 17:03:08 +0000 Subject: [PATCH 03/17] Add persistent prometheus storage --- .../templates/monitoring/kube-prometheus-stack.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index fb801c4..cf25c69 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -53,6 +53,17 @@ stringData: imageRenderer: image: repository: {{ include "cluster-addons.imagePrefix" . 
}}docker.io/grafana/grafana-image-renderer + prometheus: + prometheusSpec: + storageSpec: + volumeClaimTemplate: + spec: + # Omit storageClassName field to use default storage class + # storageClassName: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi overrides: | {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }} --- From f1e84c1756e974c309f24f28c73d6991f356e588 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 31 Jan 2023 11:24:12 +0000 Subject: [PATCH 04/17] Fix indentation --- charts/cluster-addons/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 587b3f8..fb8fc81 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -192,7 +192,7 @@ monitoring: # # alertmanager: # enabled: true - # config: + # config: # global: # slack_api_url: '' # route: From 72bb739463e71ffd8083bb2f93b527026d5fe977 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 1 Feb 2023 16:19:45 +0000 Subject: [PATCH 05/17] Fix linting --- charts/cluster-addons/templates/ingress-nginx.yaml | 2 +- charts/cluster-addons/values.yaml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml index ed07b5e..ef0e302 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -16,7 +16,7 @@ stringData: enabled: {{ .Values.monitoring.enabled }} serviceMonitor: enabled: {{ .Values.monitoring.enabled }} - namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} # monitoring-system + namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} additionalLabels: release: kube-prometheus-stack image: diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 
fb8fc81..043c856 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -181,15 +181,15 @@ monitoring: enabled: false # Alertmanager does not come with pre-configured alert sinks so we have to - # write our own (and store it in a separate values to keep secrets hidden). - # + # write our own (and store it elsewhere to keep credential/secrets hidden). + # # Example config to send alerts to a slack channel: - # - # Note - 'null' receiver must be include as default kube-prometheus-stack + # + # Note - 'null' receiver must be include as default kube-prometheus-stack # alerting rules require it. # If it is not included then alertmanager pods will fail to launch # and errors will be printed in prometheus operator pod logs. - # + # # alertmanager: # enabled: true # config: From c5d7ead6c0fc5eba67f824487c893515911d58d2 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Wed, 1 Feb 2023 16:49:59 +0000 Subject: [PATCH 06/17] Cleanup --- charts/cluster-addons/templates/ingress-nginx.yaml | 3 +-- .../templates/monitoring/kube-prometheus-stack.yaml | 2 +- charts/cluster-addons/values.yaml | 10 ++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml index ef0e302..21cec28 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -17,8 +17,7 @@ stringData: serviceMonitor: enabled: {{ .Values.monitoring.enabled }} namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} - additionalLabels: - release: kube-prometheus-stack + additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }} image: registry: {{ include "cluster-addons.imagePrefix" . 
}}registry.k8s.io admissionWebhooks: diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index cf25c69..30ed439 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -63,7 +63,7 @@ stringData: accessModes: ["ReadWriteOnce"] resources: requests: - storage: 10Gi + storage: {{- .Values.monitoring.prometheusVolumeCapacity }} overrides: | {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }} --- diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 043c856..35a2741 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -168,6 +168,16 @@ ingress: monitoring: # Indicates if the cluster monitoring should be enabled enabled: false + # labels to be added to ServiceMonitor resources + # must match labels from .serviceMonitorSelector.matchLabels + # field of Prometheus resource created by kube-prometheus-stack + # in order for Prometheus to scrape metrics from the services + serviceMonitorLabels: + release: kube-prometheus-stack + # Size of the volume to provision on the target cloud for persistent storage of prometheus data + prometheusVolumeCapacity: 10Gi + # Config for the kube-prometheus-stack helm chart + # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack kubePrometheusStack: chart: repo: https://prometheus-community.github.io/helm-charts From 63626b762d8f7e693d5325a38b6537544f2edf0b Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 2 Feb 2023 15:56:30 +0000 Subject: [PATCH 07/17] Fix template bug --- .../templates/monitoring/kube-prometheus-stack.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml 
b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 30ed439..5a80b0f 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -63,7 +63,7 @@ stringData: accessModes: ["ReadWriteOnce"] resources: requests: - storage: {{- .Values.monitoring.prometheusVolumeCapacity }} + storage: {{ .Values.monitoring.prometheusVolumeCapacity }} overrides: | {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }} --- From 2c58941dbbc3b76dc94ea4e3651fad4a9badaf8a Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 2 Feb 2023 16:26:53 +0000 Subject: [PATCH 08/17] Add NGINX ingress dashboard --- .../nginx-ingress-dashboard.json | 1265 +++++++++++++++++ .../monitoring/kube-prometheus-stack.yaml | 22 + 2 files changed, 1287 insertions(+) create mode 100644 charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json diff --git a/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json new file mode 100644 index 0000000..a87a323 --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json @@ -0,0 +1,1265 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "enable": true, + "expr": "sum(changes(nginx_ingress_controller_config_last_reload_successful_timestamp_seconds{instance!=\"unknown\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[30s])) by (controller_class)", + "hide": false, + "iconColor": "rgba(255, 96, 96, 1)", + "limit": 
100, + "name": "Config Reloads", + "showIn": 0, + "step": "30s", + "tagKeys": "controller_class", + "tags": [], + "titleFormat": "Config Reloaded", + "type": "tags" + } + ] + }, + "description": "Ingress-nginx supports a rich collection of prometheus metrics. If you have prometheus and grafana installed on your cluster then prometheus will already be scraping this data due to the scrape annotation on the deployment.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 9614, + "graphTooltip": 0, + "id": 29, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "id": 20, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Request Volume", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 
193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 82, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 95 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 21, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + 
"reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"[4-5].*\"}[2m])) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Success Rate (non-4|5xx responses)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 81, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_success{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"})", + "format": "time_series", + "instant": true, + 
"intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Config Reloads", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 83, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count(nginx_ingress_controller_config_last_reload_successful{controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Last Config Failed", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 3 + }, + "height": "200px", + "hiddenSeries": false, + "id": 86, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 
300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ ingress }}", + "metric": "network", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Ingress Request Volume", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "reqps", + "logBase": 1, + "show": true + }, + { + "format": "Bps", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00", + "max - prometheus": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 87, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": 
true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\",status!~\"[4-5].*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ ingress }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Ingress Success Rate (non-4|5xx responses)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + 
"grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 10 + }, + "height": "200px", + "hiddenSeries": false, + "id": 32, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (irate (nginx_ingress_controller_nginx_process_read_bytes_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "- sum (irate (nginx_ingress_controller_response_size_sum{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network I/O pressure", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": 
"Bps", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00", + "max - prometheus": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 10 + }, + "hiddenSeries": false, + "id": 77, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}) ", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "nginx", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Average Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": 
false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 3, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 10 + }, + "height": "", + "hiddenSeries": false, + "id": 79, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum (rate (nginx_ingress_controller_nginx_process_cpu_seconds_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m])) ", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "nginx", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt" + } + ], + "timeRegions": [], + "title": "Average CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false 
+ } + }, + { + "columns": [ + { + "$$hashKey": "object:336", + "text": "Current", + "value": "current" + } + ], + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fontSize": "100%", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "height": "1024", + "id": 85, + "links": [], + "pageSize": 7, + "scroll": true, + "showHeader": true, + "sort": { + "col": 1, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "TTL", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "pattern": "Current", + "thresholds": [ + "0", + "691200" + ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_ssl_expire_time_seconds{kubernetes_pod_name=~\"$controller\",namespace=~\"$namespace\",ingress=~\"$ingress\"}) by (host) - time()", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ host }}", + "metric": "gke_letsencrypt_cert_expiration", + "refId": "A", + "step": 1 + } + ], + "title": "Ingress Certificate Expiry", + "transform": "timeseries_aggregations", + "type": "table-old" + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "nginx" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + 
"includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash, controller_namespace)", + "refId": "Prometheus-namespace-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Controller Class", + "multi": false, + "name": "controller_class", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\"}, controller_class) ", + "refId": "Prometheus-controller_class-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Controller", + "multi": false, + "name": "controller", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\",controller_class=~\"$controller_class\"}, controller_pod) ", + "refId": "Prometheus-controller-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Ingress", 
+ "multi": false, + "name": "ingress", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\",controller_class=~\"$controller_class\",controller=~\"$controller\"}, ingress) ", + "refId": "Prometheus-ingress-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "2m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "NGINX Ingress controller", + "uid": "nginx", + "version": 1, + "weekStart": "" +} diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 5a80b0f..6727073 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -113,4 +113,26 @@ spec: data: nvidia-dcgm-exporter-dashboard.json: | {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }} + labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . 
}} + bootstrap: true + targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + releaseName: kube-prometheus-stack-dashboards + manifestSources: + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: nginx-ingress-dashboard + labels: + grafana_dashboard: "1" + data: + nginx-ingress-dashboard.json: | + {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} {{- end }} From cbbf109041cfc265f387a19d622d6a6c51bba482 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Thu, 2 Feb 2023 16:58:05 +0000 Subject: [PATCH 09/17] Fix dashboard config --- ... nvidia-dcgm-exporter-dashboard_rev3.json} | 559 +++++++++--------- .../monitoring/kube-prometheus-stack.yaml | 24 +- 2 files changed, 281 insertions(+), 302 deletions(-) rename charts/cluster-addons/grafana-dashboards/{nvidia-dcgm-exporter-dashboard_rev2.json => nvidia-dcgm-exporter-dashboard_rev3.json} (72%) diff --git a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json similarity index 72% rename from charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json rename to charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json index 40f89d8..a811c8e 100644 --- a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json +++ b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json @@ -1,68 +1,51 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "6.7.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - 
"type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], "annotations": { "list": [ { "$$hashKey": "object:192", "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 12239, "graphTooltip": 0, - "id": null, - "iteration": 1588401887165, + "id": 33, "links": [], + "liveNow": false, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -88,9 +71,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -100,6 +84,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "instant": false, "interval": "", @@ -108,9 +96,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Temperature", "tooltip": { "shared": true, @@ -119,37 +105,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": 
null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -158,54 +167,30 @@ }, "id": 14, "options": { - "fieldOptions": { + "orientation": "auto", + "reduceOptions": { "calcs": [ "mean" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 83 - }, - { - "color": "red", - "value": 87 - } - ] - }, - "unit": "celsius" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Avg. 
Temp", "type": "gauge" }, @@ -214,7 +199,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -240,10 +234,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pluginVersion": "6.5.2", + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -253,6 +247,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -260,9 +258,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Power Usage", "tooltip": { "shared": true, @@ -271,38 +267,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "cacheTimeout": null, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -312,55 +330,30 @@ "id": 16, "links": [], "options": { - 
"fieldOptions": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ "sum" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 2400, - "min": 0, - "nullValueMode": "connected", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1800 - }, - { - "color": "red", - "value": 2200 - } - ] - }, - "unit": "watt" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "horizontal", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Power Total", "type": "gauge" }, @@ -369,7 +362,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -389,7 +391,6 @@ "min": false, "rightSide": true, "show": true, - "sideWidth": null, "total": false, "values": true }, @@ -397,9 +398,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -409,6 +411,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "format": "time_series", "interval": "", @@ -418,9 +424,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU SM Clocks", "tooltip": { 
"shared": true, @@ -429,34 +433,25 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "decimals": null, "format": "hertz", "label": "", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -464,7 +459,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -502,6 +500,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -509,9 +511,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, @@ -520,16 +520,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", - "label": null, "logBase": 1, "max": "100", "min": "0", @@ -537,16 +534,12 @@ }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -554,7 +547,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -564,95 +560,6 @@ "y": 32 }, "hiddenSeries": false, - "id": 18, - "legend": { - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": 
"null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "GPU {{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Framebuffer Mem Used", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decmbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, @@ -681,6 +588,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -688,9 +599,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Tensor Core Utilization", "tooltip": { "shared": true, @@ -699,16 +608,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, "max": "1", "min": "0", @@ -716,66 +622,161 @@ }, { "format": "short", - 
"label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU Framebuffer Mem Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decmbytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } } ], "refresh": false, - "schemaVersion": 22, + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { "list": [ { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", "hide": 0, "includeAll": false, - "label": null, 
"multi": true, "name": "instance", "options": [], - "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "query": { + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refId": "Prometheus-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(gpu)", "hide": 0, "includeAll": true, - "index": -1, - "label": null, "multi": true, "name": "gpu", "options": [], - "query": "label_values(gpu)", + "query": { + "query": "label_values(gpu)", + "refId": "Prometheus-gpu-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -803,8 +804,6 @@ "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", - "variables": { - "list": [] - }, - "version": 1 + "version": 1, + "weekStart": "" } diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 6727073..e166fc0 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -107,32 +107,12 @@ spec: apiVersion: v1 kind: ConfigMap metadata: - name: nvidia-dcgm-exporter-dashboard + name: additional-grafana-dashboard labels: grafana_dashboard: "1" data: nvidia-dcgm-exporter-dashboard.json: | - {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: Manifests -metadata: - name: {{ include "cluster-addons.componentName" (list . 
"kube-prometheus-stack-dashboards") }} - labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }} -spec: - clusterName: {{ include "cluster-addons.clusterName" . }} - bootstrap: true - targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} - releaseName: kube-prometheus-stack-dashboards - manifestSources: - - template: | - apiVersion: v1 - kind: ConfigMap - metadata: - name: nginx-ingress-dashboard - labels: - grafana_dashboard: "1" - data: + {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} nginx-ingress-dashboard.json: | {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} {{- end }} From e6c10582e28629ceb43c919daff3fad02ed4123f Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 3 Feb 2023 15:32:34 +0000 Subject: [PATCH 10/17] Only include nginx dashboard when addon is enabled --- .../templates/monitoring/kube-prometheus-stack.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index e166fc0..11771ef 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -113,6 +113,8 @@ spec: data: nvidia-dcgm-exporter-dashboard.json: | {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} + {{- if .Values.ingress.enabled }} nginx-ingress-dashboard.json: | {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} + {{- end }} {{- end }} From 8982679e88849bba6741b020d5683d7d1f371d47 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 7 Feb 2023 12:03:13 +0000 Subject: [PATCH 11/17] Add metrics storage limit to prometheus --- .../templates/monitoring/kube-prometheus-stack.yaml | 1 + 
charts/cluster-addons/values.yaml | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 11771ef..496ecaf 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -55,6 +55,7 @@ stringData: repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana-image-renderer prometheus: prometheusSpec: + retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }} storageSpec: volumeClaimTemplate: spec: diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 35a2741..456cfce 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -176,6 +176,8 @@ monitoring: release: kube-prometheus-stack # Size of the volume to provision on the target cloud for persistent storage of prometheus data prometheusVolumeCapacity: 10Gi + # Should be less than prometheusVolumeCapacity + prometheusMetricsRetentionSize: 9.9GB # Config for the kube-prometheus-stack helm chart # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack kubePrometheusStack: @@ -190,13 +192,14 @@ monitoring: alertmanager: enabled: false + ############################################################################# # Alertmanager does not come with pre-configured alert sinks so we have to # write our own (and store it elsewhere to keep credential/secrets hidden). # # Example config to send alerts to a slack channel: # # Note - 'null' receiver must be include as default kube-prometheus-stack - # alerting rules require it. + # alerting rules (specifically the WatchDog alert) require it. # If it is not included then alertmanager pods will fail to launch # and errors will be printed in prometheus operator pod logs. 
# @@ -216,6 +219,7 @@ monitoring: # send_resolved: true # - name: 'gmail-notifications' # TODO: Add example here + ############################################################################# lokiStack: enabled: true From 4bc41a74bfb293c1e1f5d2899affcdbb50dc3c5d Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 7 Feb 2023 13:13:35 +0000 Subject: [PATCH 12/17] Remove duplicate values --- .../monitoring/kube-prometheus-stack.yaml | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 496ecaf..c945370 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -29,6 +29,14 @@ stringData: prometheusSpec: image: registry: {{ include "cluster-addons.imagePrefix" . }}quay.io + retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }} + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: {{ .Values.monitoring.prometheusVolumeCapacity }} thanosRuler: thanosRulerSpec: image: @@ -53,18 +61,6 @@ stringData: imageRenderer: image: repository: {{ include "cluster-addons.imagePrefix" . 
}}docker.io/grafana/grafana-image-renderer - prometheus: - prometheusSpec: - retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }} - storageSpec: - volumeClaimTemplate: - spec: - # Omit storageClassName field to use default storage class - # storageClassName: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: {{ .Values.monitoring.prometheusVolumeCapacity }} overrides: | {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }} --- From 87bc0ea8a31eadbf6a9b344a517b99f59efa3932 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 7 Feb 2023 13:16:39 +0000 Subject: [PATCH 13/17] Defer alertmanager.enabled default to KPS --- charts/cluster-addons/values.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 456cfce..3d08080 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -187,10 +187,7 @@ monitoring: version: 43.3.1 release: namespace: monitoring-system - values: - # Disable alertmanager by default - alertmanager: - enabled: false + values: {} ############################################################################# # Alertmanager does not come with pre-configured alert sinks so we have to From 049f7b318c3b8742d1e4fbf6c978cdc656fa144f Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 7 Feb 2023 13:40:36 +0000 Subject: [PATCH 14/17] Ignore .vscode --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e48b0fc..7042f36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ charts/*/charts -Chart.lock \ No newline at end of file +Chart.lock +.vscode \ No newline at end of file From d4f8af3456cb888a55bfb76d33fa0825171e9a0c Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Fri, 10 Feb 2023 14:43:21 +0000 Subject: [PATCH 15/17] Calculate retention size from volume size --- 
.../templates/monitoring/kube-prometheus-stack.yaml | 4 ++-- charts/cluster-addons/values.yaml | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index c945370..c51de01 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -29,14 +29,14 @@ stringData: prometheusSpec: image: registry: {{ include "cluster-addons.imagePrefix" . }}quay.io - retentionSize: {{ .Values.monitoring.prometheusMetricsRetentionSize }} + retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB storageSpec: volumeClaimTemplate: spec: accessModes: ["ReadWriteOnce"] resources: requests: - storage: {{ .Values.monitoring.prometheusVolumeCapacity }} + storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi thanosRuler: thanosRulerSpec: image: diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 3d08080..714666c 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -174,10 +174,8 @@ monitoring: # in order for Prometheus to scrape metrics from the services serviceMonitorLabels: release: kube-prometheus-stack - # Size of the volume to provision on the target cloud for persistent storage of prometheus data - prometheusVolumeCapacity: 10Gi - # Should be less than prometheusVolumeCapacity - prometheusMetricsRetentionSize: 9.9GB + # Size of the volume in GB to provision on the target cloud for persistent storage of prometheus data + prometheusVolumeCapacity: 10 # Config for the kube-prometheus-stack helm chart # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack kubePrometheusStack: From de01135e01d66d158a5bf78d882324fd51b86ed0 Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Mon, 20 Feb 2023 09:39:47 
+0000 Subject: [PATCH 16/17] Update VM flavor names --- charts/openstack-cluster/ci/kube-1-23-ha-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-23-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-24-ha-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-24-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-25-ha-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-25-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-26-ha-values.yaml | 4 ++-- charts/openstack-cluster/ci/kube-1-26-values.yaml | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml index cf5d10b..dcb4b5f 100644 --- a/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-23-ha-values.yaml @@ -4,10 +4,10 @@ kubernetesVersion: 1.23.15 machineImageId: c2f235c1-ad10-4e96-8568-aac864945686 controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-23-values.yaml b/charts/openstack-cluster/ci/kube-1-23-values.yaml index fe2a154..3a232b5 100644 --- a/charts/openstack-cluster/ci/kube-1-23-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-23-values.yaml @@ -7,10 +7,10 @@ apiServer: enableLoadBalancer: false controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml index f56653d..7b8eb0f 100644 --- a/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml +++ 
b/charts/openstack-cluster/ci/kube-1-24-ha-values.yaml @@ -4,10 +4,10 @@ kubernetesVersion: 1.24.9 machineImageId: ad1405d6-5270-4d5b-b403-a6cba3762f8e controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-24-values.yaml b/charts/openstack-cluster/ci/kube-1-24-values.yaml index 2c53ef0..8674552 100644 --- a/charts/openstack-cluster/ci/kube-1-24-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-24-values.yaml @@ -7,10 +7,10 @@ apiServer: enableLoadBalancer: false controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml index dcb5468..6bc7a43 100644 --- a/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-25-ha-values.yaml @@ -4,10 +4,10 @@ kubernetesVersion: 1.25.4 machineImageId: 48c078a5-fd89-4f61-9d6a-c4f48745c0ae controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-25-values.yaml b/charts/openstack-cluster/ci/kube-1-25-values.yaml index cad07da..3c3dff9 100644 --- a/charts/openstack-cluster/ci/kube-1-25-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-25-values.yaml @@ -7,10 +7,10 @@ apiServer: enableLoadBalancer: false controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small 
machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml b/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml index d4853f0..27b19b8 100644 --- a/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-26-ha-values.yaml @@ -4,10 +4,10 @@ kubernetesVersion: 1.26.0 machineImageId: 5eae91aa-0c96-472e-ba8c-edd6162281f7 controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 diff --git a/charts/openstack-cluster/ci/kube-1-26-values.yaml b/charts/openstack-cluster/ci/kube-1-26-values.yaml index 6b54f8b..a94759c 100644 --- a/charts/openstack-cluster/ci/kube-1-26-values.yaml +++ b/charts/openstack-cluster/ci/kube-1-26-values.yaml @@ -7,10 +7,10 @@ apiServer: enableLoadBalancer: false controlPlane: - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small machineCount: 1 nodeGroups: - machineCount: 2 - machineFlavor: vm.alaska.cpu.general.small + machineFlavor: vm.ska.cpu.general.small name: test-group1 From 81bc1c430a1c79ccabae2a47a898e72f824e9c7d Mon Sep 17 00:00:00 2001 From: Scott Davidson Date: Tue, 21 Feb 2023 10:45:43 +0000 Subject: [PATCH 17/17] Add loki data persistence --- charts/cluster-addons/templates/monitoring/loki-stack.yaml | 3 +++ charts/cluster-addons/values.yaml | 2 ++ 2 files changed, 5 insertions(+) diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml index 2ae29c2..a28f14d 100644 --- a/charts/cluster-addons/templates/monitoring/loki-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml @@ -12,6 +12,9 @@ stringData: loki: image: 
repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki + persistence: + enabled: true + size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi promtail: image: registry: {{ include "cluster-addons.imagePrefix" . }}docker.io diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 714666c..bf19acf 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -225,6 +225,8 @@ monitoring: release: namespace: monitoring-system values: {} + # Size of the volume in GiB to provision on the target cloud for persistent storage of loki data + lokiVolumeCapacity: 10 # Settings for node feature discovery # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery