diff --git a/.gitignore b/.gitignore index e48b0fc..7042f36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ charts/*/charts -Chart.lock \ No newline at end of file +Chart.lock +.vscode \ No newline at end of file diff --git a/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json new file mode 100644 index 0000000..a87a323 --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json @@ -0,0 +1,1265 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "enable": true, + "expr": "sum(changes(nginx_ingress_controller_config_last_reload_successful_timestamp_seconds{instance!=\"unknown\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[30s])) by (controller_class)", + "hide": false, + "iconColor": "rgba(255, 96, 96, 1)", + "limit": 100, + "name": "Config Reloads", + "showIn": 0, + "step": "30s", + "tagKeys": "controller_class", + "tags": [], + "titleFormat": "Config Reloaded", + "type": "tags" + } + ] + }, + "description": "Ingress-nginx supports a rich collection of prometheus metrics. 
If you have prometheus and grafana installed on your cluster then prometheus will already be scraping this data due to the scrape annotation on the deployment.", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 9614, + "graphTooltip": 0, + "id": 29, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "id": 20, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m])), 0.001)", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Request Volume", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + 
"color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 82, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 95 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 21, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",status!~\"[4-5].*\"}[2m])) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "Controller Success Rate (non-4|5xx responses)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 81, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_success{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Config Reloads", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + 
"mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 83, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count(nginx_ingress_controller_config_last_reload_successful{controller_pod=~\"$controller\",controller_namespace=~\"$namespace\"} == 0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 4 + } + ], + "title": "Last Config Failed", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 3 + }, + "height": "200px", + "hiddenSeries": false, + "id": 86, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "round(sum(irate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress), 0.001)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ ingress }}", + "metric": "network", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Ingress Request Volume", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "reqps", + "logBase": 1, + "show": true + }, + { + "format": "Bps", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00", + "max - prometheus": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 87, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": 
"connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\",status!~\"[4-5].*\"}[2m])) by (ingress) / sum(rate(nginx_ingress_controller_nginx_process_requests_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",namespace=~\"$namespace\",ingress=~\"$ingress\"}[2m])) by (ingress)", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ ingress }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Ingress Success Rate (non-4|5xx responses)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 10 + }, + "height": "200px", + "hiddenSeries": false, + "id": 32, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": false, + "min": false, + 
"rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (irate (nginx_ingress_controller_nginx_process_read_bytes_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "- sum (irate (nginx_ingress_controller_response_size_sum{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "format": "time_series", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Network I/O pressure", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "Bps", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00", + "max - prometheus": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": 
false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "decimals": 2, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 10 + }, + "hiddenSeries": false, + "id": 77, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}) ", + "format": "time_series", + "instant": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "nginx", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Average Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": { + "max - istio-proxy": "#890f02", + "max - master": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "decimals": 3, + "editable": false, + "error": false, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 10 + }, + "height": "", + "hiddenSeries": false, + "id": 79, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum (rate (nginx_ingress_controller_nginx_process_cpu_seconds_total{controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m])) ", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "nginx", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt" + } + ], + "timeRegions": [], + "title": "Average CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "columns": [ + { + "$$hashKey": "object:336", + "text": "Current", + "value": "current" + } + ], + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fontSize": "100%", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + 
"y": 16 + }, + "height": "1024", + "id": 85, + "links": [], + "pageSize": 7, + "scroll": true, + "showHeader": true, + "sort": { + "col": 1, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "TTL", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "pattern": "Current", + "thresholds": [ + "0", + "691200" + ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg(nginx_ingress_controller_ssl_expire_time_seconds{kubernetes_pod_name=~\"$controller\",namespace=~\"$namespace\",ingress=~\"$ingress\"}) by (host) - time()", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ host }}", + "metric": "gke_letsencrypt_cert_expiration", + "refId": "A", + "step": 1 + } + ], + "title": "Ingress Certificate Expiry", + "transform": "timeseries_aggregations", + "type": "table-old" + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "nginx" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash, controller_namespace)", + "refId": "Prometheus-namespace-Variable-Query" + }, + 
"refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Controller Class", + "multi": false, + "name": "controller_class", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\"}, controller_class) ", + "refId": "Prometheus-controller_class-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Controller", + "multi": false, + "name": "controller", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_config_hash{namespace=~\"$namespace\",controller_class=~\"$controller_class\"}, controller_pod) ", + "refId": "Prometheus-controller-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Ingress", + "multi": false, + "name": "ingress", + "options": [], + "query": { + "query": "label_values(nginx_ingress_controller_requests{namespace=~\"$namespace\",controller_class=~\"$controller_class\",controller=~\"$controller\"}, ingress) ", + "refId": 
"Prometheus-ingress-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "2m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "NGINX Ingress controller", + "uid": "nginx", + "version": 1, + "weekStart": "" +} diff --git a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json similarity index 72% rename from charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json rename to charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json index 40f89d8..a811c8e 100644 --- a/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json +++ b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json @@ -1,68 +1,51 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "6.7.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - } - ], "annotations": { "list": [ { "$$hashKey": "object:192", "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 
211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 12239, "graphTooltip": 0, - "id": null, - "iteration": 1588401887165, + "id": 33, "links": [], + "liveNow": false, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -88,9 +71,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -100,6 +84,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "instant": false, "interval": "", @@ -108,9 +96,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Temperature", "tooltip": { "shared": true, @@ -119,37 +105,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "celsius", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + 
"max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -158,54 +167,30 @@ }, "id": 14, "options": { - "fieldOptions": { + "orientation": "auto", + "reduceOptions": { "calcs": [ "mean" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 83 - }, - { - "color": "red", - "value": 87 - } - ] - }, - "unit": "celsius" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Avg. 
Temp", "type": "gauge" }, @@ -214,7 +199,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -240,10 +234,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pluginVersion": "6.5.2", + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -253,6 +247,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -260,9 +258,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Power Usage", "tooltip": { "shared": true, @@ -271,38 +267,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "watt", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "cacheTimeout": null, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -312,55 +330,30 @@ "id": 16, "links": [], "options": { - 
"fieldOptions": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ "sum" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 2400, - "min": 0, - "nullValueMode": "connected", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1800 - }, - { - "color": "red", - "value": 2200 - } - ] - }, - "unit": "watt" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "horizontal", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.3.1", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"${instance}\", gpu=~\"${gpu}\"})", "interval": "", "legendFormat": "", "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Power Total", "type": "gauge" }, @@ -369,7 +362,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -389,7 +391,6 @@ "min": false, "rightSide": true, "show": true, - "sideWidth": null, "total": false, "values": true }, @@ -397,9 +398,10 @@ "linewidth": 2, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "9.3.1", "pointradius": 2, "points": false, "renderer": "flot", @@ -409,6 +411,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"${instance}\", gpu=~\"${gpu}\"} * 1000000", "format": "time_series", "interval": "", @@ -418,9 +424,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU SM Clocks", "tooltip": { 
"shared": true, @@ -429,34 +433,25 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "decimals": null, "format": "hertz", "label": "", "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -464,7 +459,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -502,6 +500,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -509,9 +511,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "GPU Utilization", "tooltip": { "shared": true, @@ -520,16 +520,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", - "label": null, "logBase": 1, "max": "100", "min": "0", @@ -537,16 +534,12 @@ }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -554,7 +547,10 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -564,95 +560,6 @@ "y": 32 }, "hiddenSeries": false, - "id": 18, - "legend": { - "avg": true, - "current": false, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": 
"null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", - "interval": "", - "legendFormat": "GPU {{gpu}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Framebuffer Mem Used", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decmbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, "id": 4, "legend": { "alignAsTable": true, @@ -681,6 +588,10 @@ "steppedLine": false, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"${instance}\", gpu=~\"${gpu}\"}", "interval": "", "legendFormat": "GPU {{gpu}}", @@ -688,9 +599,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Tensor Core Utilization", "tooltip": { "shared": true, @@ -699,16 +608,13 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, "max": "1", "min": "0", @@ -716,66 +622,161 @@ }, { "format": "short", - 
"label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "DCGM_FI_DEV_FB_USED{instance=~\"${instance}\", gpu=~\"${gpu}\"}", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GPU Framebuffer Mem Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decmbytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false } } ], "refresh": false, - "schemaVersion": 22, + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { "list": [ { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", "hide": 0, "includeAll": false, - "label": null, 
"multi": true, "name": "instance", "options": [], - "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "query": { + "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "refId": "Prometheus-instance-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "definition": "label_values(gpu)", "hide": 0, "includeAll": true, - "index": -1, - "label": null, "multi": true, "name": "gpu", "options": [], - "query": "label_values(gpu)", + "query": { + "query": "label_values(gpu)", + "refId": "Prometheus-gpu-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -803,8 +804,6 @@ "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", - "variables": { - "list": [] - }, - "version": 1 + "version": 1, + "weekStart": "" } diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml index 3960e17..21cec28 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -10,6 +10,14 @@ metadata: stringData: defaults: | controller: + # Indicates whether ingress controller metrics should be included in prometheus + metrics: + # Enable by default if cluster monitoring is enabled + enabled: {{ .Values.monitoring.enabled }} + serviceMonitor: + enabled: {{ .Values.monitoring.enabled }} + namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }} image: registry: {{ include 
"cluster-addons.imagePrefix" . }}registry.k8s.io admissionWebhooks: diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index fb801c4..c51de01 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -29,6 +29,14 @@ stringData: prometheusSpec: image: registry: {{ include "cluster-addons.imagePrefix" . }}quay.io + retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi thanosRuler: thanosRulerSpec: image: @@ -96,10 +104,14 @@ spec: apiVersion: v1 kind: ConfigMap metadata: - name: nvidia-dcgm-exporter-dashboard + name: additional-grafana-dashboard labels: grafana_dashboard: "1" data: nvidia-dcgm-exporter-dashboard.json: | - {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} + {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} + {{- if .Values.ingress.enabled }} + nginx-ingress-dashboard.json: | + {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} + {{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml index 2ae29c2..a28f14d 100644 --- a/charts/cluster-addons/templates/monitoring/loki-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml @@ -12,6 +12,9 @@ stringData: loki: image: repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki + persistence: + enabled: true + size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi promtail: image: registry: {{ include "cluster-addons.imagePrefix" . 
}}docker.io diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index 64c6caf..bf19acf 100644 --- a/charts/cluster-addons/values.yaml +++ b/charts/cluster-addons/values.yaml @@ -163,12 +163,21 @@ ingress: version: 4.4.2 release: namespace: ingress-nginx - values: {} # Settings for cluster monitoring monitoring: # Indicates if the cluster monitoring should be enabled enabled: false + # labels to be added to ServiceMonitor resources + # must match labels from .serviceMonitorSelector.matchLabels + # field of Prometheus resource created by kube-prometheus-stack + # in order for Prometheus to scrape metrics from the services + serviceMonitorLabels: + release: kube-prometheus-stack + # Size of the volume in GiB to provision on the target cloud for persistent storage of prometheus data + prometheusVolumeCapacity: 10 + # Config for the kube-prometheus-stack helm chart + # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack kubePrometheusStack: chart: repo: https://prometheus-community.github.io/helm-charts @@ -177,6 +186,36 @@ monitoring: release: namespace: monitoring-system values: {} + + ############################################################################# + # Alertmanager does not come with pre-configured alert sinks so we have to + # write our own (and store it elsewhere to keep credentials/secrets hidden). + # + # Example config to send alerts to a slack channel: + # + # Note - the 'null' receiver must be included, as the default kube-prometheus-stack + # alerting rules (specifically the Watchdog alert) require it. + # If it is not included then alertmanager pods will fail to launch + # and errors will be printed in prometheus operator pod logs. 
+ # + # alertmanager: + # enabled: true + # config: + # global: + # slack_api_url: '' + # route: + # receiver: 'slack-notifications' + # group_by: ['namespace'] + # receivers: + # - name: 'null' + # - name: 'slack-notifications' + # slack_configs: + # - channel: '#' + # send_resolved: true + # - name: 'gmail-notifications' + # TODO: Add example here + ############################################################################# + lokiStack: enabled: true chart: @@ -186,6 +225,8 @@ monitoring: release: namespace: monitoring-system values: {} + # Size of the volume in GiB to provision on the target cloud for persistent storage of loki data + lokiVolumeCapacity: 10 # Settings for node feature discovery # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery