diff --git a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json similarity index 72% rename from charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json rename to charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json index 40f89d8..a811c8e 100644 --- a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json +++ b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json @@ -1,68 +1,51 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "6.7.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], "annotations": { "list": [ { "$$hashKey": "object:192", "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 12239, "graphTooltip": 0, - "id": null, - "iteration": 1588401887165, + "id": 33, "links": [], + "liveNow": false, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -88,9 +71,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -100,6 +84,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "instant": false, "interval": "", @@ -108,9 +96,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Temperature", "tooltip": { "shared": true, @@ -119,37 +105,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -158,54 +167,30 @@ }, "id": 14, "options": { - "fieldOptions": { + "orientation": "auto", + "reduceOptions": { "calcs": [ "mean" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 83 - }, - { - "color": "red", - "value": 87 - } - ] - }, - "unit": "celsius" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Avg. Temp", "type": "gauge" }, @@ -214,7 +199,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -240,10 +234,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pluginVersion": "6.5.2", + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -253,6 +247,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -260,9 +258,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Power Usage", "tooltip": { "shared": true, @@ -271,38 +267,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "cacheTimeout": null, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -312,55 +330,30 @@ "id": 16, "links": [], "options": { - "fieldOptions": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ "sum" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 2400, - "min": 0, - "nullValueMode": "connected", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1800 - }, - { - "color": "red", - "value": 2200 - } - ] - }, - "unit": "watt" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "horizontal", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Power Total", "type": "gauge" }, @@ -369,7 +362,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -389,7 +391,6 @@ "min": false, "rightSide": true, "show": true, - "sideWidth": null, "total": false, "values": true }, @@ -397,9 +398,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -409,6 +411,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "format": "time_series", "interval": "", @@ -418,9 +424,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU SM Clocks", "tooltip": { "shared": true, @@ -429,34 +433,25 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "decimals": null, "format": "hertz", "label": "", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -464,7 +459,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -502,6 +500,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -509,9 +511,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, @@ -520,16 +520,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", - "label": null, "logBase": 1, "max": "100", "min": "0", @@ -537,16 +534,12 @@ }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -554,7 +547,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -564,95 +560,6 @@ "y": 32 }, "hiddenSeries": false, - "id": 18, - "legend": { - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "GPU {{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Framebuffer Mem Used", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decmbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, @@ -681,6 +588,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -688,9 +599,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Tensor Core Utilization", "tooltip": { "shared": true, @@ -699,16 +608,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, "max": "1", "min": "0", @@ -716,66 +622,161 @@ }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU Framebuffer Mem Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decmbytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } } ], "refresh": false, - "schemaVersion": 22, + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { "list": [ { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", "hide": 0, "includeAll": false, - "label": null, "multi": true, "name": "instance", "options": [], - "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "query": { + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refId": "Prometheus-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(gpu)", "hide": 0, "includeAll": true, - "index": -1, - "label": null, "multi": true, "name": "gpu", "options": [], - "query": "label_values(gpu)", + "query": { + "query": "label_values(gpu)", + "refId": "Prometheus-gpu-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -803,8 +804,6 @@ "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", - "variables": { - "list": [] - }, - "version": 1 + "version": 1, + "weekStart": "" } diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index 6727073..e166fc0 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -107,32 +107,12 @@ spec: apiVersion: v1 kind: ConfigMap metadata: - name: nvidia-dcgm-exporter-dashboard + name: additional-grafana-dashboard labels: grafana_dashboard: "1" data: nvidia-dcgm-exporter-dashboard.json: | - {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: Manifests -metadata: - name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }} - labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }} -spec: - clusterName: {{ include "cluster-addons.clusterName" . }} - bootstrap: true - targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} - releaseName: kube-prometheus-stack-dashboards - manifestSources: - - template: | - apiVersion: v1 - kind: ConfigMap - metadata: - name: nginx-ingress-dashboard - labels: - grafana_dashboard: "1" - data: + {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} nginx-ingress-dashboard.json: | {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} {{- end }}