Fix dashboard config

This commit is contained in:
Scott Davidson 2023-02-02 16:58:05 +00:00
parent 2c58941dbb
commit cbbf109041
2 changed files with 281 additions and 302 deletions

View File

@ -1,68 +1,51 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
},
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.7.3"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": "-- Grafana --",
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 12239,
"graphTooltip": 0,
"id": null,
"iteration": 1588401887165,
"id": 33,
"links": [],
"liveNow": false,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@ -88,9 +71,10 @@
"linewidth": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.3.1",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -100,6 +84,10 @@
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"instant": false,
"interval": "",
@ -108,9 +96,7 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Temperature",
"tooltip": {
"shared": true,
@ -119,37 +105,60 @@
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "celsius",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
"align": false
}
},
{
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 83
},
{
"color": "red",
"value": 87
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
@ -158,54 +167,30 @@
},
"id": 14,
"options": {
"fieldOptions": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 83
},
{
"color": "red",
"value": 87
}
]
},
"unit": "celsius"
},
"overrides": [],
"fields": "",
"values": false
},
"orientation": "auto",
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "6.7.3",
"pluginVersion": "9.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "GPU Avg. Temp",
"type": "gauge"
},
@ -214,7 +199,16 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@ -240,10 +234,10 @@
"linewidth": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "6.5.2",
"pluginVersion": "9.3.1",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -253,6 +247,10 @@
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
@ -260,9 +258,7 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Power Usage",
"tooltip": {
"shared": true,
@ -271,38 +267,60 @@
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "watt",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
"align": false
}
},
{
"cacheTimeout": null,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
@ -312,55 +330,30 @@
"id": 16,
"links": [],
"options": {
"fieldOptions": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": [],
"fields": "",
"values": false
},
"orientation": "horizontal",
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "6.7.3",
"pluginVersion": "9.3.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "GPU Power Total",
"type": "gauge"
},
@ -369,7 +362,16 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@ -389,7 +391,6 @@
"min": false,
"rightSide": true,
"show": true,
"sideWidth": null,
"total": false,
"values": true
},
@ -397,9 +398,10 @@
"linewidth": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.3.1",
"pointradius": 2,
"points": false,
"renderer": "flot",
@ -409,6 +411,10 @@
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000",
"format": "time_series",
"interval": "",
@ -418,9 +424,7 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU SM Clocks",
"tooltip": {
"shared": true,
@ -429,34 +433,25 @@
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "hertz",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
"align": false
}
},
{
@ -464,7 +459,10 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@ -502,6 +500,10 @@
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
@ -509,9 +511,7 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Utilization",
"tooltip": {
"shared": true,
@ -520,16 +520,13 @@
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": "100",
"min": "0",
@ -537,16 +534,12 @@
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
"align": false
}
},
{
@ -554,7 +547,10 @@
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
@ -564,95 +560,6 @@
"y": 32
},
"hiddenSeries": false,
"id": 18,
"legend": {
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Framebuffer Mem Used",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "decmbytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
@ -681,6 +588,10 @@
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
@ -688,9 +599,7 @@
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Tensor Core Utilization",
"tooltip": {
"shared": true,
@ -699,16 +608,13 @@
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": "1",
"min": "0",
@ -716,66 +622,161 @@
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 40
},
"hiddenSeries": false,
"id": 18,
"legend": {
"avg": true,
"current": false,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "GPU Framebuffer Mem Used",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "decmbytes",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
}
],
"refresh": false,
"schemaVersion": 22,
"schemaVersion": 37,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
"hide": 0,
"includeAll": false,
"label": null,
"multi": true,
"name": "instance",
"options": [],
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
"query": {
"query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)",
"refId": "Prometheus-instance-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(gpu)",
"hide": 0,
"includeAll": true,
"index": -1,
"label": null,
"multi": true,
"name": "gpu",
"options": [],
"query": "label_values(gpu)",
"query": {
"query": "label_values(gpu)",
"refId": "Prometheus-gpu-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
@ -803,8 +804,6 @@
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "Oxed_c6Wz",
"variables": {
"list": []
},
"version": 1
"version": 1,
"weekStart": ""
}

View File

@ -107,32 +107,12 @@ spec:
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-dcgm-exporter-dashboard
name: additional-grafana-dashboard
labels:
grafana_dashboard: "1"
data:
nvidia-dcgm-exporter-dashboard.json: |
{{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }}
---
apiVersion: addons.stackhpc.com/v1alpha1
kind: Manifests
metadata:
name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack-dashboards") }}
labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack-dashboards") | nindent 4 }}
spec:
clusterName: {{ include "cluster-addons.clusterName" . }}
bootstrap: true
targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }}
releaseName: kube-prometheus-stack-dashboards
manifestSources:
- template: |
apiVersion: v1
kind: ConfigMap
metadata:
name: nginx-ingress-dashboard
labels:
grafana_dashboard: "1"
data:
{{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }}
nginx-ingress-dashboard.json: |
{{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }}
{{- end }}