diff --git a/charts/cluster-addons/grafana-dashboards/cni-calico-dashboard.json b/charts/cluster-addons/grafana-dashboards/cni-calico-dashboard.json new file mode 100644 index 0000000..f6e212c --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/cni-calico-dashboard.json @@ -0,0 +1,1513 @@ +{ + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.7.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:47", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Felix dashboard is part of calico documentation website, you will have great insight about your Calico instance by using this dashboard.", + "editable": true, + "gnetId": 12175, + "graphTooltip": 0, + "id": null, + "links": [ + { + "icon": "external link", + "includeVars": false, + "tags": [], + "targetBlank": true, + "title": "Calico documentation", + "tooltip": "Comprehensive tutorial on how to use this dashboard.", + "type": "link", + "url": "https://docs.projectcalico.org/master/maintenance/monitor-component-visual" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "panels": [], + "title": "Alerts and general info", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "Prometheus", + "description": "These metrics are part of general information related to your Calico implementation.", + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + 
"y": 1 + }, + "id": 2, + "links": [], + "options": { + "fieldOptions": { + "calcs": [ + "lastNotNull" + ], + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "felix_active_local_endpoints", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Active hosts on each node", + "transparent": true, + "type": "gauge" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 8, + "y": 1 + }, + "id": 25, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(felix_iptables_save_errors[5m]))", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "iptables save 
errors", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 11, + "y": 1 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(felix_ipset_errors[5m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "ipset errors", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 
14, + "y": 1 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(felix_cluster_num_hosts)", + "interval": "", + "legendFormat": "Calico node", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Active calico nodes", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "description": "This graph shows you all the errors that Calico encounters, it is important to note occasional errors are acceptable. 
However, a rise in the number of errors or constant error counters means Calico is not working properly.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 7, + "x": 17, + "y": 1 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.7.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(felix_ipset_errors[5m])", + "interval": "", + "legendFormat": "{{instance}} ipset errors", + "refId": "A" + }, + { + "expr": "rate(felix_iptables_restore_errors[5m])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}} iptables restore errors", + "refId": "B" + }, + { + "expr": "rate(felix_iptables_save_errors[5m])", + "interval": "", + "legendFormat": "{{instance}} iptables save errors", + "refId": "C" + }, + { + "expr": "rate(felix_log_errors[5m])", + "interval": "", + "legendFormat": "{{instance}} log errors", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Errors plot", + "tooltip": { + "shared": true, + "sort": 1, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "Prometheus", + "description": 
"More policies on Felix means more effort required by Calico to manage packets. ", + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 5 + }, + "id": 20, + "options": { + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [], + "values": false + }, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "6.7.3", + "targets": [ + { + "expr": "felix_cluster_num_policies", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Felix cluster policies", + "transparent": true, + "type": "gauge" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 8, + "y": 5 + }, + "id": 29, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(felix_iptables_restore_errors[5m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + 
], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "iptables restore errors", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 11, + "y": 5 + }, + "id": 26, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(felix_log_errors[5m]))", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Felix log errors", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "Prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": 
false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 14, + "y": 5 + }, + "id": 24, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.7.3", + "postfix": "", + "postfixFontSize": "200%", + "prefix": "", + "prefixFontSize": "200%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(felix_resyncs_started[5m])) ", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Felix resync started", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 7, + "x": 17, + "y": 5 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_logs_dropped", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": 
null, + "timeRegions": [], + "timeShift": null, + "title": "Felix dropped logs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": "Prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 14, + "panels": [], + "title": "Dataplane", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "description": "Dataplane apply time can indicate how busy your Kubernetes instance is. This can slow down Calico performance", + "fill": 2, + "fillGradient": 4, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_int_dataplane_apply_time_seconds{quantile=\"0.5\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Dataplane apply time quantile 0.5", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 2, + "fillGradient": 4, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 10 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_int_dataplane_apply_time_seconds{quantile=\"0.9\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Dataplane apply time quantile 0.9", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": 
false, + "datasource": "Prometheus", + "fill": 2, + "fillGradient": 4, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 10 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_int_dataplane_apply_time_seconds{quantile=\"0.99\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Dataplane apply time quantile 0.99", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 35, + "panels": [], + "title": "Route table", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 18 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_route_table_list_seconds{quantile=\"0.5\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Felix route table list seconds quantile 0.5", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 18 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_route_table_list_seconds{quantile=\"0.9\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Felix route table list seconds quantile 
0.9", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 18 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "felix_route_table_list_seconds{quantile=\"0.99\"}", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Felix route table list seconds quantile 0.99", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], 
+ "refresh": false, + "schemaVersion": 22, + "style": "dark", + "tags": [ + "calico", + "felix", + "kubernetes", + "k8s", + "calico-node", + "cloud", + "cluster monitoring", + "policy monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Calico", + "uid": "0ee01b36d7bc5f3d0c9c238c34bcc855", + "variables": { + "list": [] + }, + "version": 1 + } diff --git a/charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json b/charts/cluster-addons/grafana-dashboards/ingress-nginx-dashboard.json similarity index 100% rename from charts/cluster-addons/grafana-dashboards/nginx-ingress-dashboard.json rename to charts/cluster-addons/grafana-dashboards/ingress-nginx-dashboard.json diff --git a/charts/cluster-addons/grafana-dashboards/loki-metrics-dashboard.json b/charts/cluster-addons/grafana-dashboards/loki-metrics-dashboard.json new file mode 100644 index 0000000..bd0ba79 --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/loki-metrics-dashboard.json @@ -0,0 +1,418 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.3.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": 10880, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 
+ }, + "id": 8, + "interval": "", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg_over_time(loki_ingester_chunk_entries_count[5m])", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Loki's stored chunk entries", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate (loki_distributor_ingester_appends_total[1m])", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + 
"title": "Average batch appends sent to ingesters (Loki)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": false, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (promtail_file_bytes_total{instance=~\".*\"}) by (instance)", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of bytes total by promtail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, 
+ "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (promtail_encoded_bytes_total{pod=~'.*'}) by (pod)", + "format": "time_series", + "instant": false, + "legendFormat": "{{ pod }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of bytes encoded and ready to send by Promtail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 19, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Loki / Metrics", + "uid": "MQHVDmtWk", + "version": 5, + "description": "Loki and Promtail metrics." 
+} diff --git a/charts/cluster-addons/grafana-dashboards/loki-dashboard.json b/charts/cluster-addons/grafana-dashboards/loki-pod-logs-dashboard.json similarity index 88% rename from charts/cluster-addons/grafana-dashboards/loki-dashboard.json rename to charts/cluster-addons/grafana-dashboards/loki-pod-logs-dashboard.json index 797bc59..e336d86 100644 --- a/charts/cluster-addons/grafana-dashboards/loki-dashboard.json +++ b/charts/cluster-addons/grafana-dashboards/loki-pod-logs-dashboard.json @@ -24,12 +24,6 @@ "name": "Loki", "version": "1.0.0" }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, { "type": "panel", "id": "text", @@ -165,10 +159,10 @@ "templating": { "list": [ { - "allValue": null, + "allValue": ".+", "current": {}, - "datasource": "Prometheus", - "definition": "label_values(kube_pod_info, namespace)", + "datasource": "Loki", + "definition": "label_values(namespace)", "hide": 0, "includeAll": false, "index": -1, @@ -176,7 +170,7 @@ "multi": false, "name": "namespace", "options": [], - "query": "label_values(kube_pod_info, namespace)", + "query": "label_values(namespace)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -188,10 +182,10 @@ "useTags": false }, { - "allValue": ".*", + "allValue": ".+", "current": {}, - "datasource": "Prometheus", - "definition": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "datasource": "Loki", + "definition": "label_values({namespace=~\"$namespace\"}, pod)", "hide": 0, "includeAll": true, "index": -1, @@ -199,7 +193,7 @@ "multi": true, "name": "pod", "options": [], - "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "query": "label_values({namespace=~\"$namespace\"}, pod)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -241,11 +235,11 @@ ] }, "timezone": "", - "title": "Loki / Logs", + "title": "Loki / Pod Logs", "uid": "209fd89b771c318dd442225414a50b59", "variables": { "list": [] }, "version": 1, - "description": 
"Search logs stored in Loki" + "description": "Search pod logs stored in Loki" } diff --git a/charts/cluster-addons/grafana-dashboards/loki-systemd-logs-dashboard.json b/charts/cluster-addons/grafana-dashboards/loki-systemd-logs-dashboard.json new file mode 100644 index 0000000..4f4bd8b --- /dev/null +++ b/charts/cluster-addons/grafana-dashboards/loki-systemd-logs-dashboard.json @@ -0,0 +1,245 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.7.0" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "logs", + "name": "Logs", + "version": "" + }, + { + "type": "datasource", + "id": "loki", + "name": "Loki", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "Loki", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(count_over_time({unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"[$__interval]))", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:168", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }, + { + "$$hashKey": "object:169", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "Loki", + "gridPos": { + "h": 25, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "maxDataPoints": "", + "options": { + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "expr": "{unit=~\"$unit\", hostname=~\"$hostname\"} |~ \"$search\"", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Logs Panel", + "type": "logs" + } + ], + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".+", + "current": {}, + "datasource": "Loki", + "definition": "label_values(unit)", + "hide": 0, + "includeAll": false, + "index": -1, + "label": null, + "multi": false, + "name": "unit", + "options": [], + "query": "label_values(unit)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": {}, + "datasource": "Loki", + "definition": "label_values({unit=~\"$unit\"}, hostname)", + "hide": 0, + "includeAll": true, + "index": -1, + "label": null, + "multi": true, + "name": "hostname", + "options": [], + "query": "label_values({unit=~\"$unit\"}, hostname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "hide": 0, + "label": null, + "name": "search", + "options": [], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + 
"from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Loki / Systemd Logs", + "uid": "fa1bd43aed803111be9cc923cada9811", + "variables": { + "list": [] + }, + "version": 1, + "description": "Search systemd logs stored in Loki" +} diff --git a/charts/cluster-addons/templates/cni/calico.yaml b/charts/cluster-addons/templates/cni/calico.yaml index 89ae3cd..4d2a987 100644 --- a/charts/cluster-addons/templates/cni/calico.yaml +++ b/charts/cluster-addons/templates/cni/calico.yaml @@ -13,6 +13,8 @@ stringData: registry: {{ include "cluster-addons.imagePrefix" . }}quay.io installation: registry: {{ include "cluster-addons.imagePrefix" . }}docker.io/ + nodeMetricsPort: 9091 + typhaMetricsPort: 9093 calicoNetwork: bgp: Disabled nodeAddressAutodetectionV4: @@ -50,4 +52,110 @@ spec: - secret: name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config key: overrides +{{- if .Values.monitoring.enabled }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "cni-calico-monitoring") }} + labels: {{ include "cluster-addons.componentLabels" (list . "cni-calico-monitoring") | nindent 4 }} + annotations: + # Tell Argo to ignore the non-controller owner references for this object + argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" +spec: + clusterName: {{ include "cluster-addons.clusterName" . 
}} + bootstrap: true + targetNamespace: {{ .Values.cni.calico.release.namespace }} + releaseName: cni-calico-monitoring + manifestSources: + # calico-kube-controllers + - template: | + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: calico-kube-controllers-metrics + namespace: calico-system + spec: + endpoints: + - port: metrics-port + namespaceSelector: + matchNames: + - calico-system + selector: + matchLabels: + k8s-app: calico-kube-controllers + # calico-node + - template: | + apiVersion: v1 + kind: Service + metadata: + name: calico-node-metrics + namespace: calico-system + labels: + k8s-app: calico-node + spec: + clusterIP: None + ports: + - name: metrics-port + port: 9091 + selector: + k8s-app: calico-node + --- + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: calico-node-metrics + namespace: calico-system + spec: + endpoints: + - port: metrics-port + namespaceSelector: + matchNames: + - calico-system + selector: + matchLabels: + k8s-app: calico-node + # calico-typha + - template: | + apiVersion: v1 + kind: Service + metadata: + name: calico-typha-metrics + namespace: calico-system + labels: + k8s-app: calico-typha + spec: + clusterIP: None + ports: + - name: metrics-port + port: 9093 + selector: + k8s-app: calico-typha + --- + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: calico-typha-metrics + namespace: calico-system + spec: + endpoints: + - port: metrics-port + namespaceSelector: + matchNames: + - calico-system + selector: + matchLabels: + k8s-app: calico-typha + # dashboard + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: cni-calico-dashboard + labels: + grafana_dashboard: "1" + data: + cni-calico-dashboard.json: | + {{- .Files.Get "grafana-dashboards/cni-calico-dashboard.json" | nindent 12 }} +{{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml 
index 21cec28..dd4c566 100644 --- a/charts/cluster-addons/templates/ingress-nginx.yaml +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -10,14 +10,12 @@ metadata: stringData: defaults: | controller: - # Indicates whether ingress controller metrics should be included in prometheus + {{- if .Values.monitoring.enabled }} metrics: - # Enable by default if cluster monitoring is enabled - enabled: {{ .Values.monitoring.enabled }} + enabled: true serviceMonitor: - enabled: {{ .Values.monitoring.enabled }} - namespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} - additionalLabels: {{ toYaml .Values.monitoring.serviceMonitorLabels | nindent 12 }} + enabled: true + {{- end }} image: registry: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io admissionWebhooks: @@ -51,4 +49,31 @@ spec: - secret: name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-config key: overrides +{{- if .Values.monitoring.enabled }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "ingress-nginx-dashboards") }} + labels: {{ include "cluster-addons.componentLabels" (list . "ingress-nginx-dashboards") | nindent 4 }} + annotations: + # Tell Argo to ignore the non-controller owner references for this object + argocd.argoproj.io/sync-options: "ControllerReferencesOnly=true" +spec: + clusterName: {{ include "cluster-addons.clusterName" . 
}} + bootstrap: true + targetNamespace: {{ .Values.ingress.nginx.release.namespace }} + releaseName: ingress-nginx-dashboards + manifestSources: + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: ingress-nginx-dashboards + labels: + grafana_dashboard: "1" + data: + nginx-ingress-dashboard.json: | + {{- .Files.Get "grafana-dashboards/ingress-nginx-dashboard.json" | nindent 12 }} +{{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml index c51de01..cd9987b 100644 --- a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -10,9 +10,20 @@ metadata: stringData: defaults: | alertmanager: + # Don't apply the namespace grouping by default + config: + route: + group_by: [] alertmanagerSpec: image: registry: {{ include "cluster-addons.imagePrefix" . }}quay.io + # Make sure that alertmanager finds configurations with the alertmanager name as a label + alertmanagerConfigSelector: + matchLabels: + alertmanager: kube-prometheus-stack-alertmanager + # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources + alertmanagerConfigMatcherStrategy: + type: None prometheusOperator: admissionWebhooks: patch: @@ -29,14 +40,28 @@ stringData: prometheusSpec: image: registry: {{ include "cluster-addons.imagePrefix" . 
}}quay.io - retentionSize: {{ mulf 0.95 .Values.monitoring.prometheusVolumeCapacity }}GB - storageSpec: - volumeClaimTemplate: - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: {{ .Values.monitoring.prometheusVolumeCapacity }}Gi + # Tell Prometheus to pick up all monitors, regardless of labels + podMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + {{- + $storageSize := dig + "prometheus" + "prometheusSpec" + "storageSpec" + "volumeClaimTemplate" + "spec" + "resources" + "requests" + "storage" + "" + .Values.monitoring.kubePrometheusStack.release.values + }} + {{- if $storageSize }} + # Set the retention size to 95% of the given volume size + {{- $storageAmount := mustRegexFind "^([0-9]*[.])?[0-9]+" $storageSize | float64 }} + {{- $storageUnits := mustRegexFind "(K|M|G|T|E|P)i?$" $storageSize }} + retentionSize: {{ mulf 0.95 $storageAmount }}{{ $storageUnits }}B + {{- end }} thanosRuler: thanosRulerSpec: image: @@ -46,13 +71,16 @@ stringData: repository: {{ include "cluster-addons.imagePrefix" . }}registry.k8s.io/kube-state-metrics/kube-state-metrics prometheus-node-exporter: image: - repository: {{ include "cluster-addons.imagePrefix" . }}quay.io/prometheus/node-exporter + registry: {{ include "cluster-addons.imagePrefix" . }}quay.io grafana: image: repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/grafana sidecar: image: repository: {{ include "cluster-addons.imagePrefix" . }}quay.io/kiwigrid/k8s-sidecar + # Tell Grafana to include dashboards from all namespaces + dashboards: + searchNamespace: ALL downloadDashboardsImage: repository: {{ include "cluster-addons.imagePrefix" . 
}}docker.io/curlimages/curl initChownData: @@ -104,14 +132,10 @@ spec: apiVersion: v1 kind: ConfigMap metadata: - name: additional-grafana-dashboard + name: additional-grafana-dashboards labels: grafana_dashboard: "1" data: nvidia-dcgm-exporter-dashboard.json: | {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev3.json" | nindent 12 }} - {{- if .Values.ingress.enabled }} - nginx-ingress-dashboard.json: | - {{- .Files.Get "grafana-dashboards/nginx-ingress-dashboard.json" | nindent 12 }} - {{- end }} {{- end }} diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml index a28f14d..088ec66 100644 --- a/charts/cluster-addons/templates/monitoring/loki-stack.yaml +++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml @@ -12,12 +12,45 @@ stringData: loki: image: repository: {{ include "cluster-addons.imagePrefix" . }}docker.io/grafana/loki - persistence: + serviceMonitor: enabled: true - size: {{ .Values.monitoring.lokiVolumeCapacity }}Gi promtail: image: registry: {{ include "cluster-addons.imagePrefix" . 
}}docker.io + serviceMonitor: + enabled: true + # Get promtail to scrape systemd services + config: + snippets: + extraScrapeConfigs: | + - job_name: journal + journal: + path: /var/log/journal + max_age: 12h + labels: + job: systemd-journal + relabel_configs: + - source_labels: ['__journal__systemd_unit'] + target_label: 'unit' + - source_labels: ['__journal__hostname'] + target_label: 'hostname' + - source_labels: ['__journal_priority_keyword'] + target_label: level + # Mount journal directory and machine-id file into promtail pods + extraVolumes: + - name: journal + hostPath: + path: /var/log/journal + - name: machine-id + hostPath: + path: /etc/machine-id + extraVolumeMounts: + - name: journal + mountPath: /var/log/journal + readOnly: true + - name: machine-id + mountPath: /etc/machine-id + readOnly: true grafana: sidecar: datasources: @@ -58,7 +91,7 @@ metadata: spec: clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true - targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + targetNamespace: {{ .Values.monitoring.lokiStack.release.namespace }} releaseName: loki-stack-dashboards manifestSources: - template: | @@ -82,10 +115,14 @@ spec: apiVersion: v1 kind: ConfigMap metadata: - name: loki-stack-grafana-dashboard + name: loki-stack-grafana-dashboards labels: grafana_dashboard: "1" data: - loki-dashboard.json: | - {{- .Files.Get "grafana-dashboards/loki-dashboard.json" | nindent 12 }} + loki-pod-logs-dashboard.json: | + {{- .Files.Get "grafana-dashboards/loki-pod-logs-dashboard.json" | nindent 12 }} + loki-systemd-logs-dashboard.json: | + {{- .Files.Get "grafana-dashboards/loki-systemd-logs-dashboard.json" | nindent 12 }} + loki-metrics-dashboard.json: | + {{- .Files.Get "grafana-dashboards/loki-metrics-dashboard.json" | nindent 12 }} {{- end }} diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml index bf19acf..c30e621 100644 --- a/charts/cluster-addons/values.yaml +++ 
b/charts/cluster-addons/values.yaml @@ -168,54 +168,29 @@ ingress: monitoring: # Indicates if the cluster monitoring should be enabled enabled: false - # labels to be added to ServiceMonitor resources - # must match labels from .serviceMonitorSelector.matchLabels - # field of Prometheus resource created by kube-prometheus-stack - # in order for Prometheus to scrape metrics from the services - serviceMonitorLabels: - release: kube-prometheus-stack - # Size of the volume in GB to provision on the target cloud for persistent storage of prometheus data - prometheusVolumeCapacity: 10 # Config for the kube-prometheus-stack helm chart # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack kubePrometheusStack: chart: repo: https://prometheus-community.github.io/helm-charts name: kube-prometheus-stack - version: 43.3.1 + version: 45.4.0 release: namespace: monitoring-system - values: {} - - ############################################################################# - # Alertmanager does not come with pre-configured alert sinks so we have to - # write our own (and store it elsewhere to keep credential/secrets hidden). - # - # Example config to send alerts to a slack channel: - # - # Note - 'null' receiver must be include as default kube-prometheus-stack - # alerting rules (specifically the WatchDog alert) require it. - # If it is not included then alertmanager pods will fail to launch - # and errors will be printed in prometheus operator pod logs. 
- # - # alertmanager: - # enabled: true - # config: - # global: - # slack_api_url: '' - # route: - # receiver: 'slack-notifications' - # group_by: ['namespace'] - # receivers: - # - name: 'null' - # - name: 'slack-notifications' - # slack_configs: - # - channel: '#' - # send_resolved: true - # - name: 'gmail-notifications' - # TODO: Add example here - ############################################################################# - + values: + prometheus: + prometheusSpec: + # Enable persistence by default + # The amount of data that is retained will be 90 days or 95% of the size of the + # persistent volume, whichever is reached first + retention: 90d + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi lokiStack: enabled: true chart: @@ -224,9 +199,12 @@ monitoring: version: 2.8.9 release: namespace: monitoring-system - values: {} - # Size of the volume in GB to provision on the target cloud for persistent storage of loki data - lokiVolumeCapacity: 10 + values: + loki: + # Enable persistence by default + persistence: + enabled: true + size: 10Gi # Settings for node feature discovery # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery diff --git a/skopeo-manifests/kube-prometheus-stack.yml b/skopeo-manifests/kube-prometheus-stack.yml index 9c94b45..8100bd9 100644 --- a/skopeo-manifests/kube-prometheus-stack.yml +++ b/skopeo-manifests/kube-prometheus-stack.yml @@ -1,28 +1,28 @@ docker.io: images: grafana/grafana: - - 9.3.1 + - 9.3.6 quay.io: images: kiwigrid/k8s-sidecar: - - 1.21.0 + - 1.22.0 prometheus/alertmanager: - v0.25.0 prometheus/node-exporter: - v1.5.0 prometheus/prometheus: - - v2.40.5 + - v2.42.0 prometheus-operator/prometheus-config-reloader: - - v0.61.1 + - v0.63.0 prometheus-operator/prometheus-operator: - - v0.61.1 + - v0.63.0 thanos/thanos: - - v0.29.0 + - v0.30.2 registry.k8s.io: images: ingress-nginx/kube-webhook-certgen: - - 
v1.3.0 + - v20221220-controller-v1.5.1-58-g787ea74b6 kube-state-metrics/kube-state-metrics: - - v2.7.0 + - v2.8.0