diff --git a/prometheus/templates/configmap-bin.yaml b/prometheus/templates/configmap-bin.yaml index 6a7b32040e..096e1f1344 100644 --- a/prometheus/templates/configmap-bin.yaml +++ b/prometheus/templates/configmap-bin.yaml @@ -20,7 +20,7 @@ limitations under the License. apiVersion: v1 kind: ConfigMap metadata: - name: prometheus-bin + name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }} data: apache.sh: | {{ tuple "bin/_apache.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} diff --git a/prometheus/templates/configmap-etc.yaml b/prometheus/templates/configmap-etc.yaml index 38314a9445..025add07ec 100644 --- a/prometheus/templates/configmap-etc.yaml +++ b/prometheus/templates/configmap-etc.yaml @@ -16,34 +16,14 @@ limitations under the License. {{- if .Values.manifests.configmap_etc }} {{- $envAll := . }} - -{{- if empty $envAll.Values.conf.prometheus.scrape_configs.rule_files -}} -{{- $_ := set $envAll.Values "__rule_files" ( list ) }} -{{- $rulesKeys := keys $envAll.Values.conf.prometheus.rules -}} -{{- range $rule := $rulesKeys }} -{{- $rulesFile := printf "/etc/config/rules/%s.rules" $rule }} -{{- $__rule_files := append $envAll.Values.__rule_files $rulesFile }} -{{- $_ := set $envAll.Values "__rule_files" $__rule_files }} -{{ end }} -{{- $_ := set .Values.conf.prometheus.scrape_configs "rule_files" $envAll.Values.__rule_files -}} -{{- end -}} - -{{- if not (empty $envAll.Values.conf.prometheus.scrape_configs.scrape_configs) }} -{{- $_ := set $envAll.Values "__updated_scrape_configs" ( list ) }} -{{- $promScrapeTarget := first $envAll.Values.conf.prometheus.scrape_configs.scrape_configs }} -{{- if (empty $promScrapeTarget.basic_auth) }} -{{- $_ := set $promScrapeTarget "basic_auth" $envAll.Values.endpoints.monitoring.auth.admin }} -{{- end }} -{{- end }} - --- apiVersion: v1 kind: Secret metadata: - name: prometheus-etc + name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }} type: Opaque data: - prometheus.yml: {{ toYaml .Values.conf.prometheus.scrape_configs | b64enc }} +{{- include "helm-toolkit.snippets.values_template_renderer" (dict "envAll" $envAll "template" .Values.conf.prometheus.scrape_configs.template "key" "prometheus.yml" "format" "Secret") | indent 2 }} {{ range $key, $value := .Values.conf.prometheus.rules }} {{ $key }}.rules: {{ toYaml $value | b64enc }} {{ end }} diff --git a/prometheus/templates/pod-helm-tests.yaml b/prometheus/templates/pod-helm-tests.yaml index e3986c852e..38dab678d8 100644 --- a/prometheus/templates/pod-helm-tests.yaml +++ b/prometheus/templates/pod-helm-tests.yaml @@ -16,7 +16,6 @@ limitations under the License. {{- if .Values.manifests.helm_tests }} {{- $envAll := . 
}} -{{- $promUserSecret := .Values.secrets.prometheus.admin }} {{- $serviceAccountName := print .Release.Name "-test" }} {{ tuple $envAll "tests" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} @@ -47,12 +46,12 @@ spec: - name: PROMETHEUS_ADMIN_USERNAME valueFrom: secretKeyRef: - name: {{ $promUserSecret }} + name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }} key: PROMETHEUS_ADMIN_USERNAME - name: PROMETHEUS_ADMIN_PASSWORD valueFrom: secretKeyRef: - name: {{ $promUserSecret }} + name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }} key: PROMETHEUS_ADMIN_PASSWORD - name: PROMETHEUS_ENDPOINT value: {{ tuple "monitoring" "internal" "http" $envAll | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }} @@ -68,6 +67,6 @@ spec: emptyDir: {} - name: prometheus-bin configMap: - name: prometheus-bin + name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }} defaultMode: 0555 {{- end }} diff --git a/prometheus/templates/secret-prometheus.yaml b/prometheus/templates/secret-prometheus.yaml index 8e41346aa2..558126b5d6 100644 --- a/prometheus/templates/secret-prometheus.yaml +++ b/prometheus/templates/secret-prometheus.yaml @@ -16,12 +16,11 @@ limitations under the License. {{- if .Values.manifests.secret_prometheus }} {{- $envAll := . }} -{{- $secretName := index $envAll.Values.secrets.prometheus.admin }} --- apiVersion: v1 kind: Secret metadata: - name: {{ $secretName }} + name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }} type: Opaque data: PROMETHEUS_ADMIN_USERNAME: {{ .Values.endpoints.monitoring.auth.admin.username | b64enc }} diff --git a/prometheus/templates/statefulset.yaml b/prometheus/templates/statefulset.yaml index 1185a6069b..1df6bebf0b 100644 --- a/prometheus/templates/statefulset.yaml +++ b/prometheus/templates/statefulset.yaml @@ -19,15 +19,14 @@ limitations under the License. {{- $mounts_prometheus := .Values.pod.mounts.prometheus.prometheus }} {{- $mounts_prometheus_init := .Values.pod.mounts.prometheus.init_container }} -{{- $promUserSecret := .Values.secrets.prometheus.admin }} -{{- $serviceAccountName := printf "%s-%s" .Release.Name "prometheus" }} -{{ tuple $envAll "prometheus" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} +{{- $rcControllerName := printf "%s-%s" $envAll.Release.Name "prometheus" }} +{{ tuple $envAll "prometheus" $rcControllerName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: - name: {{ $serviceAccountName }} + name: {{ $rcControllerName | quote }} rules: - apiGroups: - "" @@ -55,20 +54,20 @@ rules: apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: - name: {{ $serviceAccountName }} + name: {{ $rcControllerName | quote }} subjects: - kind: ServiceAccount - name: {{ $serviceAccountName }} + name: {{ $rcControllerName | quote }} namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: {{ $serviceAccountName }} + name: {{ $rcControllerName | quote }} apiGroup: rbac.authorization.k8s.io --- apiVersion: apps/v1 kind: StatefulSet metadata: - name: prometheus + name: {{ $rcControllerName | quote }} annotations: {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }} labels: @@ -90,7 +89,7 @@ spec: configmap-etc-hash: {{ tuple "configmap-etc.yaml" . 
| include "helm-toolkit.utils.hash" }} spec: {{ dict "envAll" $envAll "application" "api" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }} - serviceAccountName: {{ $serviceAccountName }} + serviceAccountName: {{ $rcControllerName | quote }} affinity: {{ tuple $envAll "prometheus" "api" | include "helm-toolkit.snippets.kubernetes_pod_anti_affinity" | indent 8 }} nodeSelector: @@ -129,12 +128,12 @@ spec: - name: PROMETHEUS_ADMIN_USERNAME valueFrom: secretKeyRef: - name: {{ $promUserSecret }} + name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }} key: PROMETHEUS_ADMIN_USERNAME - name: PROMETHEUS_ADMIN_PASSWORD valueFrom: secretKeyRef: - name: {{ $promUserSecret }} + name: {{ printf "%s-%s" $envAll.Release.Name "admin-user" | quote }} key: PROMETHEUS_ADMIN_PASSWORD volumeMounts: - name: pod-tmp @@ -169,6 +168,10 @@ spec: port: {{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} initialDelaySeconds: 30 timeoutSeconds: 30 + env: +{{- if .Values.pod.env.prometheus }} +{{ include "helm-toolkit.utils.to_k8s_env_vars" .Values.pod.env.prometheus | indent 12 }} +{{- end }} volumeMounts: - name: pod-tmp mountPath: /tmp @@ -202,11 +205,11 @@ spec: emptyDir: {} - name: prometheus-etc secret: - secretName: prometheus-etc + secretName: {{ printf "%s-%s" $envAll.Release.Name "prometheus-etc" | quote }} defaultMode: 0444 - name: prometheus-bin configMap: - name: prometheus-bin + name: {{ printf "%s-%s" $envAll.Release.Name "prometheus-bin" | quote }} defaultMode: 0555 {{ if $mounts_prometheus.volumes }}{{ toYaml $mounts_prometheus.volumes | indent 8 }}{{ end }} {{- if not .Values.storage.enabled }} diff --git a/prometheus/values.yaml b/prometheus/values.yaml index d20d593795..85b272af1a 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -43,6 +43,8 @@ labels: node_selector_value: enabled pod: + env: + prometheus: null security_context: api: pod: @@ -238,8 +240,6 @@ secrets: monitoring: prometheus: public: prometheus-tls-public - prometheus: - admin: prometheus-admin-creds storage: enabled: true @@ -346,6 +346,24 @@ conf: + # Expose metrics to all users, as this is not sensitive information and + # circumvents the inability of Prometheus to interpolate environment vars + # in its configuration file + + ProxyPass http://localhost:{{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/metrics + ProxyPassReverse http://localhost:{{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/metrics + Satisfy Any + Allow from all + + # Expose the /federate endpoint to all users, as this is also not + # sensitive information and circumvents the inability of Prometheus to + # interpolate environment vars in its configuration file + + ProxyPass http://localhost:{{ tuple "monitoring" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/metrics + ProxyPassReverse http://localhost:{{ tuple "monitoring" "internal" "api" . 
| include "helm-toolkit.endpoints.endpoint_port_lookup" }}/metrics + Satisfy Any + Allow from all + # Restrict general user (LDAP) access to the /graph endpoint, as general trusted # users should only be able to query Prometheus for metrics and not have access # to information like targets, configuration, flags or build info for Prometheus @@ -486,1779 +504,560 @@ conf: # If set to true, allows for http reloads and shutdown of Prometheus web.enable_lifecycle: false scrape_configs: - global: - scrape_interval: 60s - evaluation_interval: 60s - scrape_configs: - # NOTE(srwilkers): The job definition for Prometheus should always be - # listed first, so we can inject the basic auth username and password - # via the endpoints section - - job_name: 'prometheus-metrics' - kubernetes_sd_configs: - - role: endpoints + template: | + {{- $promHost := tuple "monitoring" "public" . | include "helm-toolkit.endpoints.hostname_fqdn_endpoint_lookup" }} + {{- if not (empty .Values.conf.prometheus.rules)}} + rule_files: + {{- $rulesKeys := keys .Values.conf.prometheus.rules -}} + {{- range $rule := $rulesKeys }} + {{ printf "- /etc/config/rules/%s.rules" $rule }} + {{- end }} + {{- end }} + global: scrape_interval: 60s - relabel_configs: - - source_labels: - - __meta_kubernetes_service_name - action: keep - regex: "prom-metrics" - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - action: keep - regex: true - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - action: replace - target_label: kubernetes_namespace - - source_labels: - - __meta_kubernetes_service_name - action: replace - target_label: instance - - source_labels: - - __meta_kubernetes_service_name - action: replace - target_label: kubernetes_name - - source_labels: - - __meta_kubernetes_service_name - target_label: job - replacement: ${1} - - job_name: kubelet - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - scrape_interval: 45s - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: - - __meta_kubernetes_node_name - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - - source_labels: - - __meta_kubernetes_node_name - action: replace - target_label: kubernetes_io_hostname - # Scrape config for Kubelet cAdvisor. 
- # - # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics - # (those whose names begin with 'container_') have been removed from the - # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to - # retrieve those metrics. - # - # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor - # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" - # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with - # the --cadvisor-port=0 Kubelet flag). - # - # This job is not necessary and should be removed in Kubernetes 1.6 and - # earlier versions, or it will cause the metrics to be scraped twice. - - job_name: 'kubernetes-cadvisor' - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: - - __meta_kubernetes_node_name - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - metric_relabel_configs: - - source_labels: - - __name__ - regex: 'container_network_tcp_usage_total' - action: drop - - source_labels: - - __name__ - regex: 'container_tasks_state' - action: drop - - source_labels: - - __name__ - regex: 'container_network_udp_usage_total' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_failures_total' - action: drop - - source_labels: - - __name__ - regex: 'container_cpu_load_average_10s' - action: drop - - source_labels: - - __name__ - regex: 'container_cpu_system_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_cpu_user_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_inodes_free' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_inodes_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_io_current' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_io_time_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_io_time_weighted_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_read_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_reads_merged_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_reads_merged_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_reads_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_sector_reads_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_sector_writes_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_write_seconds_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_writes_bytes_total' - action: drop - - 
source_labels: - - __name__ - regex: 'container_fs_writes_merged_total' - action: drop - - source_labels: - - __name__ - regex: 'container_fs_writes_total' - action: drop - - source_labels: - - __name__ - regex: 'container_last_seen' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_cache' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_failcnt' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_max_usage_bytes' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_rss' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_swap' - action: drop - - source_labels: - - __name__ - regex: 'container_memory_usage_bytes' - action: drop - - source_labels: - - __name__ - regex: 'container_network_receive_errors_total' - action: drop - - source_labels: - - __name__ - regex: 'container_network_receive_packets_dropped_total' - action: drop - - source_labels: - - __name__ - regex: 'container_network_receive_packets_total' - action: drop - - source_labels: - - __name__ - regex: 'container_network_transmit_errors_total' - action: drop - - source_labels: - - __name__ - regex: 'container_network_transmit_packets_dropped_total' - action: drop - - source_labels: - - __name__ - regex: 'container_network_transmit_packets_total' - action: drop - - source_labels: - - __name__ - regex: 'container_spec_cpu_period' - action: drop - - source_labels: - - __name__ - regex: 'container_spec_cpu_shares' - action: drop - - source_labels: - - __name__ - regex: 'container_spec_memory_limit_bytes' - action: drop - - source_labels: - - __name__ - regex: 'container_spec_memory_reservation_limit_bytes' - action: drop - - source_labels: - - __name__ - regex: 'container_spec_memory_swap_limit_bytes' - action: drop - - source_labels: - - __name__ - regex: 'container_start_time_seconds' - action: drop - # Scrape config for API servers. - # - # Kubernetes exposes API servers as endpoints to the default/kubernetes - # service so this uses `endpoints` role and uses relabelling to only keep - # the endpoints associated with the default/kubernetes service using the - # default named port `https`. This works for single API server deployments as - # well as HA API server deployments. - - job_name: 'apiserver' - kubernetes_sd_configs: - - role: endpoints - scrape_interval: 45s - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. 
+ evaluation_interval: 60s + external_labels: + prometheus_host: {{$promHost}} + scrape_configs: + - job_name: kubelet + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + scrape_interval: 45s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: + - __meta_kubernetes_node_name + action: replace + target_label: kubernetes_io_hostname + # Scrape config for Kubelet cAdvisor. # - # insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - # Keep only the default/kubernetes service endpoints for the https port. This - # will add targets for each API server which Kubernetes adds an endpoint to - # the default/kubernetes service. - relabel_configs: - - source_labels: - - __meta_kubernetes_namespace - - __meta_kubernetes_service_name - - __meta_kubernetes_endpoint_port_name - action: keep - regex: default;kubernetes;https - metric_relabel_configs: - - source_labels: - - __name__ - regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket' - action: drop - - source_labels: - - __name__ - regex: 'rest_client_request_latency_seconds_bucket' - action: drop - - source_labels: - - __name__ - regex: 'apiserver_response_sizes_bucket' - action: drop - - source_labels: - - __name__ - regex: 'apiserver_admission_step_admission_latencies_seconds_bucket' - action: drop - - source_labels: - - __name__ - regex: 'apiserver_admission_controller_admission_latencies_seconds_count' - action: drop - - source_labels: - - __name__ - regex: 'apiserver_admission_controller_admission_latencies_seconds_sum' - action: drop - - source_labels: - - __name__ - regex: 'apiserver_request_latencies_summary' - action: drop - # Scrape config for service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/scrape`: Only scrape services that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. 
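To make the annotation-driven relabeling described above concrete, a minimal Service that this scheme would discover might look as follows; this is an illustrative sketch, and the name, namespace, and port are hypothetical rather than part of this chart:

```yaml
# Hypothetical Service picked up by the annotation-driven relabeling:
# prometheus.io/scrape=true keeps the target, prometheus.io/port rewrites
# __address__ to <ip>:9102, and path/scheme override the /metrics-over-http
# defaults.
apiVersion: v1
kind: Service
metadata:
  name: example-exporter          # illustrative name
  namespace: monitoring
  labels:
    application: example-exporter
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/scheme: "http"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "9102"
spec:
  selector:
    application: example-exporter
  ports:
    - name: metrics
      port: 9102
```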
- - job_name: 'openstack-exporter' - kubernetes_sd_configs: - - role: endpoints - scrape_interval: 60s - relabel_configs: - - source_labels: - - __meta_kubernetes_service_name - action: keep - regex: "openstack-metrics" - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - action: keep - regex: true - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - action: replace - target_label: kubernetes_namespace - - source_labels: - - __meta_kubernetes_service_name - action: replace - target_label: instance - - source_labels: - - __meta_kubernetes_service_name - action: replace - target_label: kubernetes_name - - source_labels: - - __meta_kubernetes_service_name - target_label: job - replacement: ${1} - - job_name: 'node-exporter' - kubernetes_sd_configs: - - role: endpoints - scrape_interval: 60s - relabel_configs: - - source_labels: - - __meta_kubernetes_service_name - action: keep - regex: 'node-exporter' - - source_labels: - - __meta_kubernetes_pod_node_name - action: replace - target_label: hostname - - job_name: 'kubernetes-service-endpoints' - kubernetes_sd_configs: - - role: endpoints - scrape_interval: 60s - relabel_configs: - - source_labels: - - __meta_kubernetes_service_name - action: drop - regex: '(openstack-metrics|prom-metrics|ceph-mgr|node-exporter)' - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - action: keep - regex: true - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - action: replace - target_label: kubernetes_namespace - - source_labels: - - __meta_kubernetes_service_name - action: replace - target_label: kubernetes_name - - source_labels: - - __meta_kubernetes_service_name - target_label: job - replacement: ${1} - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the - # pod's declared ports (default is a port-free target if none are declared). 
- - job_name: 'kubernetes-pods' - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - job_name: calico-etcd - kubernetes_sd_configs: - - role: service - scrape_interval: 20s - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: keep - source_labels: - - __meta_kubernetes_service_name - regex: "calico-etcd" - - action: keep - source_labels: - - __meta_kubernetes_namespace - regex: kube-system - target_label: namespace - - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - source_labels: - - __meta_kubernetes_service_name - target_label: service - - source_labels: - - __meta_kubernetes_service_name - target_label: job - replacement: ${1} - - source_labels: - - __meta_kubernetes_service_label - target_label: job - regex: calico-etcd - replacement: ${1} - - target_label: endpoint - replacement: "calico-etcd" - - job_name: ceph-mgr - kubernetes_sd_configs: - - role: service - scrape_interval: 20s - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: keep - source_labels: - - __meta_kubernetes_service_name - regex: "ceph-mgr" - - source_labels: - - __meta_kubernetes_service_port_name - action: drop - regex: 'ceph-mgr' - - action: keep - source_labels: - - __meta_kubernetes_namespace - regex: ceph - target_label: namespace - - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - source_labels: - - __meta_kubernetes_service_name - target_label: service - - source_labels: - - __meta_kubernetes_service_name - target_label: job - replacement: ${1} - - source_labels: - - __meta_kubernetes_service_label - target_label: job - regex: ceph-mgr - replacement: ${1} - - target_label: endpoint - replacement: "ceph-mgr" - alerting: - alertmanagers: - - kubernetes_sd_configs: + # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics + # (those whose names begin with 'container_') have been removed from the + # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to + # retrieve those metrics. + # + # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor + # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" + # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with + # the --cadvisor-port=0 Kubelet flag). + # + # This job is not necessary and should be removed in Kubernetes 1.6 and + # earlier versions, or it will cause the metrics to be scraped twice. + - job_name: 'kubernetes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. 
This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'container_network_tcp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_tasks_state' + action: drop + - source_labels: + - __name__ + regex: 'container_network_udp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failures_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_load_average_10s' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_system_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_user_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_free' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_current' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_weighted_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_read_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_write_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_bytes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_last_seen' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_cache' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failcnt' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_max_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_rss' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_swap' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_dropped_total' + action: drop + - 
source_labels: + - __name__ + regex: 'container_network_receive_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_period' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_shares' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_reservation_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_swap_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_start_time_seconds' + action: drop + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'apiserver' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 45s + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. 
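To spell out the keep rule that follows: Prometheus joins the values of the listed source labels with `;` before matching, so a discovered endpoint survives only when the joined string matches the regex exactly. A sketch with worked, illustrative values:

```yaml
relabel_configs:
  - source_labels:
      - __meta_kubernetes_namespace           # e.g. "default"
      - __meta_kubernetes_service_name        # e.g. "kubernetes"
      - __meta_kubernetes_endpoint_port_name  # e.g. "https"
    action: keep
    # Joined value "default;kubernetes;https" matches -> API server kept;
    # e.g. "kube-system;kube-dns;dns" does not match -> target dropped.
    regex: default;kubernetes;https
```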
+ relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + action: keep + regex: default;kubernetes;https + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'rest_client_request_latency_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_response_sizes_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_step_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_count' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_sum' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_request_latencies_summary' + action: drop + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'openstack-exporter' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "openstack-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: 'node-exporter' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: 'node-exporter' + - source_labels: + - __meta_kubernetes_pod_node_name + action: replace + target_label: hostname + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: drop + regex: '(openstack-metrics|prom-metrics|ceph-mgr|node-exporter)' + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). 
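As a sketch of how the port-rewrite rule in the kubernetes-pods job that follows behaves: the joined string of `__address__` and the port annotation, e.g. `10.0.0.5:8080;9102`, matches `([^:]+)(?::\d+)?;(\d+)` and is rewritten by `$1:$2` to `10.0.0.5:9102`. A pod annotated like this (name, image, and ports are hypothetical) would therefore be scraped on port 9102:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: example-app              # hypothetical pod
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "9102"   # scrape here, not on the declared port
spec:
  containers:
    - name: app
      image: example/app:latest  # hypothetical image
      ports:
        - containerPort: 8080    # application port, ignored for scraping
```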
+ - job_name: 'kubernetes-pods' + kubernetes_sd_configs: - role: pod - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - relabel_configs: - - source_labels: [__meta_kubernetes_pod_label_application] - regex: alertmanager - action: keep - - source_labels: [__meta_kubernetes_pod_container_port_name] - regex: alerts-api - action: keep - - source_labels: [__meta_kubernetes_pod_container_port_name] - regex: peer-mesh - action: drop - rules: - alertmanager: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 - for: 5m - labels: - severity: critical - annotations: - description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. - summary: Alertmanager configurations are inconsistent - - alert: AlertmanagerDownOrMissing - expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 - for: 5m - labels: - severity: warning - annotations: - description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. - summary: Alertmanager down or not discovered - - alert: FailedReload - expr: alertmanager_config_last_reload_successful == 0 - for: 10m - labels: - severity: warning - annotations: - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}. - summary: Alertmanager configuration reload has failed - etcd3: - groups: - - name: etcd3.rules - rules: - - alert: etcd_InsufficientMembers - expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) - for: 3m - labels: - severity: critical - annotations: - description: If one more etcd member goes down the cluster will be unavailable - summary: etcd cluster insufficient members - - alert: etcd_NoLeader - expr: etcd_server_has_leader{job="etcd"} == 0 - for: 1m - labels: - severity: critical - annotations: - description: etcd member {{ $labels.instance }} has no leader - summary: etcd member has no leader - - alert: etcd_HighNumberOfLeaderChanges - expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour - summary: a high number of leader changes within the etcd cluster are happening - - alert: etcd_HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' - summary: a high number of gRPC requests are failing - - alert: etcd_HighNumberOfFailedGRPCRequests - expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' - 
summary: a high number of gRPC requests are failing - - alert: etcd_GRPCRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 - for: 10m - labels: - severity: critical - annotations: - description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow - summary: slow gRPC requests - - alert: etcd_HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: etcd_HighNumberOfFailedHTTPRequests - expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 - for: 5m - labels: - severity: critical - annotations: - description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' - summary: a high number of HTTP requests are failing - - alert: etcd_HTTPRequestsSlow - expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow - summary: slow HTTP requests - - alert: etcd_EtcdMemberCommunicationSlow - expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow - summary: etcd member communication is slow - - alert: etcd_HighNumberOfFailedProposals - expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour - summary: a high number of proposals within the etcd cluster are failing - - alert: etcd_HighFsyncDurations - expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} fync durations are high - summary: high fsync durations - - alert: etcd_HighCommitDurations - expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 - for: 10m - labels: - severity: warning - annotations: - description: etcd instance {{ $labels.instance }} commit durations are high - summary: high commit durations - kube_apiserver: - groups: - - name: kube-apiserver.rules - rules: - - alert: K8SApiserverDown - expr: absent(up{job="apiserver"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. - summary: API server unreachable - - alert: K8SApiServerLatency - expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 - for: 10m - labels: - severity: warning - annotations: - description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. 
- summary: Kubernetes apiserver latency is high - kube_controller_manager: - groups: - - name: kube-controller-manager.rules - rules: - - alert: K8SControllerManagerDown - expr: absent(up{job="kube-controller-manager-discovery"} == 1) - for: 5m - labels: - severity: critical - annotations: - description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. - runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager - summary: Controller manager is down - kubelet: - groups: - - name: kubelet.rules - rules: - - alert: K8SNodeNotReady - expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1 - for: 1m - labels: - severity: critical - annotations: - description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute - summary: '{{ $labels.node }} Node status is NotReady and {{ $labels.status }}' - - alert: K8SManyNodesNotReady - expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2 - for: 1m - labels: - severity: critical - annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' - summary: Many Kubernetes nodes are Not Ready - - alert: K8SManyNodesNotReady - expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2 - for: 1m - labels: - severity: critical - annotations: - description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' - summary: Many Kubernetes nodes are Not Ready - - alert: K8SNodesNotReady - expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0 - for: 1m - labels: - severity: critical - annotations: - description: '{{ $value }} nodes are notReady state.' - summary: One or more Kubernetes nodes are Not Ready - - alert: K8SKubeletDown - expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 - for: 1m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets. - summary: Many Kubelets cannot be scraped - - alert: K8SKubeletDown - expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 - for: 1m - labels: - severity: critical - annotations: - description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. 
- summary: Many Kubelets cannot be scraped - - alert: K8SKubeletTooManyPods - expr: kubelet_running_pod_count > 100 - labels: - severity: warning - annotations: - description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 - summary: Kubelet is close to pod limit - kubernetes: - groups: - - name: kubernetes.rules - rules: - - alert: kube_statefulset_replicas_unavailable - expr: kube_statefulset_status_replicas < kube_statefulset_replicas - for: 5m - labels: - severity: page - annotations: - description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' - summary: '{{$labels.statefulset}}: has inssuficient replicas.' - - alert: daemonsets_misscheduled - expr: kube_daemonset_status_number_misscheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' - summary: 'Daemonsets not scheduled correctly' - - alert: daemonsets_not_scheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number' - summary: 'Less than desired number of daemonsets scheduled' - - alert: daemonset_pods_unavailable - expr: kube_daemonset_status_number_unavailable > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable' - summary: 'Daemonset pods unavailable, due to one of many reasons' - - alert: deployment_replicas_unavailable - expr: kube_deployment_status_replicas_unavailable > 0 - for: 10m - labels: - severity: page - annotations: - description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' - summary: '{{$labels.deployment}}: has inssuficient replicas.' - - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable - expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 - for: 10m - labels: - severity: page - annotations: - description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update' - summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.' 
- - alert: job_status_failed - expr: kube_job_status_failed > 0 - for: 10m - labels: - severity: page - annotations: - description: 'Job {{$labels.exported_job}} is in failed status' - summary: '{{$labels.exported_job}} has failed status' - - alert: pod_status_pending - expr: kube_pod_status_phase{phase="Pending"} == 1 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' - - alert: pod_error_image_pull - expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: pod_status_error_image_pull_backoff - expr: kube_pod_container_status_waiting_reason {reason="ImagePullBackOff"} == 1 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: pod_error_crash_loop_back_off - expr: kube_pod_container_status_waiting_reason {reason="CrashLoopBackOff"} == 1 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an CrashLoopBackOff error for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: pod_error_config_error - expr: kube_pod_container_status_waiting_reason {reason="CreateContainerConfigError"} == 1 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: replicaset_missing_replicas - expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0 - for: 10m - labels: - severity: page - annotations: - description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' - summary: 'Replicaset {{$labels.replicaset}} is missing replicas' - - alert: pod_container_terminated - expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 - for: 10m - labels: - severity: page - annotations: - description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes' - summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' - - alert: volume_claim_capacity_high_utilization - expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80 - for: 5m - labels: - severity: page - annotations: - description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity' - summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.' 
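Rule groups like the ones above and below are loaded from `conf.prometheus.rules`: the template introduced earlier renders each key X under that map as `/etc/config/rules/X.rules`, and configmap-etc.yaml writes the same key into the -etc Secret. An operator can therefore ship an extra group purely through a values override; a minimal sketch, in which the key, alert name, and threshold are illustrative:

```yaml
conf:
  prometheus:
    rules:
      custom:                    # becomes /etc/config/rules/custom.rules
        groups:
          - name: custom.rules
            rules:
              - alert: scrape_target_down    # hypothetical alert
                expr: up == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 'Target {{ $labels.instance }} of job {{ $labels.job }} has been down for 10 minutes'
                  summary: 'Scrape target down'
```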
- basic_linux: - groups: - - name: basic_linux.rules - rules: - - alert: node_filesystem_full_80percent - expr: sort(node_filesystem_free{fstype =~ "xfs|ext[34]"} < node_filesystem_size{fstype =~ "xfs|ext[34]"} - * 0.2) / 1024 ^ 3 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} - got less than 10% space left on its filesystem.' - summary: '{{$labels.alias}}: Filesystem is running out of space soon.' - - alert: node_filesystem_full_in_4h - expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} - is running out of space of in approx. 4 hours' - summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.' - - alert: node_filedescriptors_full_in_3h - expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum - for: 20m - labels: - severity: page - annotations: - description: '{{$labels.alias}} is running out of available file descriptors - in approx. 3 hours' - summary: '{{$labels.alias}} is running out of available file descriptors in - 3 hours.' - - alert: node_load1_90percent - expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9 - for: 1h - labels: - severity: page - annotations: - description: '{{$labels.alias}} is running with > 90% total load for at least - 1h.' - summary: '{{$labels.alias}}: Running on high load.' - - alert: node_cpu_util_90percent - expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90 - for: 1h - labels: - severity: page - annotations: - description: '{{$labels.alias}} has total CPU utilization over 90% for at least - 1h.' - summary: '{{$labels.alias}}: High CPU utilization.' - - alert: node_ram_using_90percent - expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal - * 0.1 - for: 30m - labels: - severity: page - annotations: - description: '{{$labels.alias}} is using at least 90% of its RAM for at least - 30 minutes now.' - summary: '{{$labels.alias}}: Using lots of RAM.' - - alert: node_swap_using_80percent - expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) - > node_memory_SwapTotal * 0.8 - for: 10m - labels: - severity: page - annotations: - description: '{{$labels.alias}} is using 80% of its swap space for at least - 10 minutes now.' - summary: '{{$labels.alias}}: Running out of swap soon.' - - alert: node_high_cpu_load - expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0 - for: 1m - labels: - severity: warning - annotations: - description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}' - summary: '{{$labels.alias}}: Running on high load: {{$value}}' - - alert: node_high_memory_load - expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers - + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85 - for: 1m - labels: - severity: warning - annotations: - description: Host memory usage is {{ humanize $value }}%. Reported by - instance {{ $labels.instance }} of job {{ $labels.job }}. 
- summary: Server memory is almost full - - alert: node_high_storage_load - expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) - / node_filesystem_size{mountpoint="/"} * 100 > 85 - for: 30s - labels: - severity: warning - annotations: - description: Host storage usage is {{ humanize $value }}%. Reported by - instance {{ $labels.instance }} of job {{ $labels.job }}. - summary: Server storage is almost full - - alert: node_high_swap - expr: (node_memory_SwapTotal - node_memory_SwapFree) < (node_memory_SwapTotal - * 0.4) - for: 1m - labels: - severity: warning - annotations: - description: Host system has a high swap usage of {{ humanize $value }}. Reported - by instance {{ $labels.instance }} of job {{ $labels.job }}. - summary: Server has a high swap usage - - alert: node_high_network_drop_rcv - expr: node_network_receive_drop{device!="lo"} > 3000 - for: 30s - labels: - severity: warning - annotations: - description: Host system has an unusally high drop in network reception ({{ - humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ - $labels.job }} - summary: Server has a high receive drop - - alert: node_high_network_drop_send - expr: node_network_transmit_drop{device!="lo"} > 3000 - for: 30s - labels: - severity: warning - annotations: - description: Host system has an unusally high drop in network transmission ({{ - humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ - $labels.job }} - summary: Server has a high transmit drop - - alert: node_high_network_errs_rcv - expr: node_network_receive_errs{device!="lo"} > 3000 - for: 30s - labels: - severity: warning - annotations: - description: Host system has an unusally high error rate in network reception - ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job - {{ $labels.job }} - summary: Server has unusual high reception errors - - alert: node_high_network_errs_send - expr: node_network_transmit_errs{device!="lo"} > 3000 - for: 30s - labels: - severity: warning - annotations: - description: Host system has an unusally high error rate in network transmission - ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job - {{ $labels.job }} - summary: Server has unusual high transmission errors - - alert: node_network_conntrack_usage_80percent - expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8) - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit' - summary: '{{$labels.instance}}: available network conntrack entries are low.' - - alert: node_entropy_available_low - expr: node_entropy_available_bits < 300 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than required of 300' - summary: '{{$labels.instance}}: is low on entropy bits.' 
- - alert: node_hwmon_high_cpu_temp - expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}' - summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}' - - alert: node_vmstat_paging_rate_high - expr: irate(node_vmstat_pgpgin[5m]) > 80 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}' - summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}' - - alert: node_xfs_block_allocation_high - expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}' - summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}' - - alert: node_network_bond_slaves_down - expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 - for: 5m - labels: - severity: page - annotations: - description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).' - summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)' - - alert: node_numa_memory_used - expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}' - summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}' - - alert: node_ntp_clock_skew_high - expr: abs(node_ntp_drift_seconds) > 2 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}' - summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds' - - alert: node_disk_read_latency - expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.device}} has a high read latency of {{ $value }}' - summary: 'High read latency observed for device {{ $labels.device }}' - - alert: node_disk_write_latency - expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40 - for: 5m - labels: - severity: page - annotations: - description: '{{$labels.device}} has a high write latency of {{ $value }}' - summary: 'High write latency observed for device {{ $labels.device }}' - openstack: - groups: - - name: openstack.rules - rules: - - alert: os_glance_api_availability - expr: openstack_check_glance_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Glance API is not available at {{$labels.url}}' - - alert: os_nova_api_availability - expr: openstack_check_nova_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Nova API is not available at 
{{$labels.url}}' - - alert: os_keystone_api_availability - expr: openstack_check_keystone_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Keystone API is not available at {{$labels.url}}' - - alert: os_neutron_api_availability - expr: openstack_check_neutron_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Neutron API is not available at {{$labels.url}}' - - alert: os_neutron_metadata_agent_availability - expr: openstack_services_neutron_metadata_agent_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'One or more neutron metadata_agents are not available for more than 5 minutes' - summary: 'One or more neutron metadata_agents are not available' - - alert: os_neutron_openvswitch_agent_availability - expr: openstack_services_neutron_openvswitch_agent_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'One or more neutron openvswitch agents are not available for more than 5 minutes' - summary: 'One or more neutron openvswitch agents are not available' - - alert: os_neutron_dhcp_agent_availability - expr: openstack_services_neutron_dhcp_agent_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'One or more neutron dhcp agents are not available for more than 5 minutes' - summary: 'One or more neutron dhcp agents are not available' - - alert: os_neutron_l3_agent_availability - expr: openstack_services_neutron_l3_agent_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'One or more neutron L3 agents are not available for more than 5 minutes' - summary: 'One or more neutron L3 agents are not available' - - alert: os_swift_api_availability - expr: openstack_check_swift_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Swift API is not available at {{$labels.url}}' - - alert: os_cinder_api_availability - expr: openstack_check_cinder_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Cinder API is not available at {{$labels.url}}' - - alert: os_cinder_scheduler_availability - expr: openstack_services_cinder_cinder_scheduler != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Cinder scheduler is not available for more than 5 minutes' - summary: 'Cinder scheduler is not available' - - alert: os_heat_api_availability - expr: openstack_check_heat_api != 1 - for: 5m - labels: - severity: page - annotations: - description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes' - summary: 'Heat API is not available at {{$labels.url}}' - - alert: os_nova_compute_disabled - expr: openstack_services_nova_compute_disabled_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-compute is disabled on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-compute is disabled on some hosts' - - alert: os_nova_conductor_disabled - expr: openstack_services_nova_conductor_disabled_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' - summary: 'Openstack compute service 
nova-conductor is disabled on some hosts' - - alert: os_nova_consoleauth_disabled - expr: openstack_services_nova_consoleauth_disabled_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' - - alert: os_nova_scheduler_disabled - expr: openstack_services_nova_scheduler_disabled_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-scheduler is disabled on some hosts' - - alert: os_nova_compute_down - expr: openstack_services_nova_compute_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-compute is down on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-compute is down on some hosts' - - alert: os_nova_conductor_down - expr: openstack_services_nova_conductor_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-conductor is down on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-conductor is down on some hosts' - - alert: os_nova_consoleauth_down - expr: openstack_services_nova_consoleauth_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-consoleauth is down on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-consoleauth is down on some hosts' - - alert: os_nova_scheduler_down - expr: openstack_services_nova_scheduler_down_total > 0 - for: 5m - labels: - severity: page - annotations: - description: 'nova-scheduler is down on certain hosts for more than 5 minutes' - summary: 'Openstack compute service nova-scheduler is down on some hosts' - - alert: os_vm_vcpu_usage_high - expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80 - for: 5m - labels: - severity: page - annotations: - description: 'Openstack VM vcpu usage is hight at {{$value}} percent' - summary: 'Openstack VM vcpu usage is high' - - alert: os_vm_ram_usage_high - expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80 - for: 5m - labels: - severity: page - annotations: - description: 'Openstack VM RAM usage is hight at {{$value}} percent' - summary: 'Openstack VM RAM usage is high' - - alert: os_vm_disk_usage_high - expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80 - for: 5m - labels: - severity: page - annotations: - description: 'Openstack VM Disk usage is hight at {{$value}} percent' - summary: 'Openstack VM Disk usage is high' - ceph: - groups: - - name: ceph.rules - rules: - - alert: no_active_ceph_mgr - expr: count(up{job="ceph-mgr"} == 1) == 0 - for: 5m - labels: - severity: warning - annotations: - description: 'no ceph active mgr is present or all ceph mgr are down' - summary: 'no ceph active mgt is present' - - alert: ceph_mon_quorum_low - expr: ceph_mon_quorum_count < 3 - for: 5m - labels: - severity: page - annotations: - description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' - summary: 'ceph high availability is at risk' - - alert: ceph_cluster_usage_high - expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80 - for: 5m - labels: - severity: page - annotations: - description: 'ceph cluster capacity usage 
more than 80 percent' - summary: 'ceph cluster usage is more than 80 percent' - - alert: ceph_placement_group_degrade_pct_high - expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80 - for: 5m - labels: - severity: critical - annotations: - description: 'ceph placement group degradation is more than 80 percent' - summary: 'ceph placement groups degraded' - - alert: ceph_osd_down_pct_high - expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80 - for: 5m - labels: - severity: critical - annotations: - description: 'ceph OSDs down percent is more than 80 percent' - summary: 'ceph OSDs down percent is high' - - alert: ceph_osd_down - expr: ceph_osd_up == 0 - for: 1m - labels: - severity: critical - annotations: - description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' - summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' - - alert: ceph_osd_out - expr: ceph_osd_in == 0 - for: 5m - labels: - severity: page - annotations: - description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' - summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' - fluentd: - groups: - - name: fluentd.rules - rules: - - alert: fluentd_not_running - expr: fluentd_up == 0 - for: 5m - labels: - severity: page - annotations: - description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes' - summary: 'Fluentd is down' - calico: - groups: - - name: calico.rules - rules: - - alert: calico_datapane_failures_high_1h - expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5 - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour' - summary: 'A high number of dataplane failures within Felix are happening' - - alert: calico_datapane_address_msg_batch_size_high_5m - expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5 - for: 5m - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size' - summary: 'Felix address message batch size is higher' - - alert: calico_datapane_iface_msg_batch_size_high_5m - expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5 - for: 5m - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size' - summary: 'Felix interface message batch size is higher' - - alert: calico_ipset_errors_high_1h - expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5 - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour' - summary: 'A high number of ipset errors within Felix are happening' - - alert: calico_iptable_save_errors_high_1h - expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5 - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the 
last hour' - summary: 'A high number of iptable save errors within Felix are happening' - - alert: calico_iptable_restore_errors_high_1h - expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5 - labels: - severity: page - annotations: - description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour' - summary: 'A high number of iptable restore errors within Felix are happening' - rabbitmq: - groups: - - name: rabbitmq.rules - rules: - - alert: rabbitmq_network_pratitions_detected - expr: min(partitions) by(instance) > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions' - summary: 'RabbitMQ Network partitions detected' - - alert: rabbitmq_down - expr: min(rabbitmq_up) by(instance) != 1 - for: 10m - labels: - severity: page - annotations: - description: 'RabbitMQ Server instance {{ $labels.instance }} is down' - summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down the last 10 mins' - - alert: rabbitmq_file_descriptor_usage_high - expr: fd_used * 100 /fd_total > 80 - for: 10m - labels: - severity: warning - annotations: - description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.' - summary: 'RabbitMQ file descriptors usage is high for last 10 mins' - - alert: rabbitmq_node_disk_free_alarm - expr: node_disk_free_alarm > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.' - summary: 'RabbitMQ disk space usage is high' - - alert: rabbitmq_node_memory_alarm - expr: node_mem_alarm > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.' - summary: 'RabbitMQ memory usage is high' - - alert: rabbitmq_less_than_3_nodes - expr: running < 3 - for: 10m - labels: - severity: warning - annotations: - description: 'RabbitMQ Server has less than 3 nodes running.' - summary: 'RabbitMQ server is at risk of loosing data' - - alert: rabbitmq_queue_messages_returned_high - expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 - for: 5m - labels: - severity: warning - annotations: - description: 'RabbitMQ Server is returing more than 50 percent of messages received.' - summary: 'RabbitMQ server is returning more than 50 percent of messages received.' - - alert: rabbitmq_consumers_low_utilization - expr: queue_consumer_utilisation < .4 - for: 5m - labels: - severity: warning - annotations: - description: 'RabbitMQ consumers message consumption speed is low' - summary: 'RabbitMQ consumers message consumption speed is low' - - alert: rabbitmq_high_message_load - expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 - for: 5m - labels: - severity: warning - annotations: - description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.' - summary: 'RabbitMQ has high message load' - elasticsearch: - groups: - - name: elasticsearch.rules - rules: - - alert: es_high_process_open_files_count - expr: sum(elasticsearch_process_open_files_count) by (host) > 64000 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.' 
- summary: 'Elasticsearch has a very high process open file count.' - - alert: es_high_process_cpu_percent - expr: elasticsearch_process_cpu_percent > 95 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.' - summary: 'Elasticsearch process cpu usage is more than 95 percent.' - - alert: es_fs_usage_high - expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.' - summary: 'Elasticsearch filesystem usage is high.' - - alert: es_unassigned_shards - expr: elasticsearch_cluster_health_unassigned_shards > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch has {{ $value }} unassigned shards.' - summary: 'Elasticsearch has unassigned shards and hence a unhealthy cluster state.' - - alert: es_cluster_health_timed_out - expr: elasticsearch_cluster_health_timed_out > 0 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch cluster health status call timedout {{ $value }} times.' - summary: 'Elasticsearch cluster health status calls are timing out.' - - alert: es_cluster_health_status_alert - expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2 - for: 10m - labels: - severity: warning - annotations: - description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.' - summary: 'Elasticsearch cluster health status is not green.' - - alert: es_cluster_health_too_few_nodes_running - expr: elasticsearch_cluster_health_number_of_nodes < 3 - for: 10m - labels: - severity: warning - annotations: - description: 'There are only {{$value}} < 3 ElasticSearch nodes running' - summary: 'ElasticSearch running on less than 3 nodes' - - alert: es_cluster_health_too_few_data_nodes_running - expr: elasticsearch_cluster_health_number_of_data_nodes < 3 - for: 10m - labels: - severity: warning - annotations: - description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' - summary: 'ElasticSearch running on less than 3 data nodes' - - alert: es_cluster_health_too_few_data_nodes_running - expr: elasticsearch_cluster_health_number_of_data_nodes < 3 - for: 10m - labels: - severity: warning - annotations: - description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' - summary: 'ElasticSearch running on less than 3 data nodes' - mariadb: - groups: - - name: mariadb.rules - rules: - - alert: mariadb_table_lock_wait_high - expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 - for: 10m - labels: - severity: warning - annotations: - description: 'Mariadb has high table lock waits of {{ $value }} percentage' - summary: 'Mariadb table lock waits are high' - - alert: mariadb_node_not_ready - expr: mysql_global_status_wsrep_ready != 1 - for: 10m - labels: - severity: warning - annotations: - description: '{{$labels.job}} on {{$labels.instance}} is not ready.' 
- summary: 'Galera cluster node not ready' - - alert: mariadb_galera_node_out_of_sync - expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 - for: 10m - labels: - severity: warning - annotations: - description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)' - summary: 'Galera cluster node out of sync' - - alert: mariadb_innodb_replication_fallen_behind - expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) - for: 10m - labels: - severity: warning - annotations: - description: 'The mysql innodb replication has fallen behind and is not recovering' - summary: 'MySQL innodb replication is lagging' - postgresql: - groups: - - name: postgresql.rules - rules: - - alert: pg_replication_fallen_behind - expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1) - for: 5m - labels: - severity: warning - annotations: - description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }} - title: Postgres Replication lag is over 2 minutes - - alert: pg_connections_too_high - expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95 - for: 5m - labels: - severity: warn - channel: database - annotations: - title: Postgresql has {{$value}} connections on {{$labels.fqdn}} which is close to the maximum - - alert: pg_deadlocks_detected - expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0 - for: 5m - labels: - severity: warn - annotations: - description: postgresql at {{$labels.instance}} is showing {{$value}} rate of deadlocks for database {{$labels.datname}} - title: Postgres server is experiencing deadlocks - prometheus_exporters: - groups: - - name: prometheus_exporters.rules - rules: - - alert: prom_exporter_ceph_unavailable - expr: absent(ceph_health_status) - for: 10m - labels: - severity: warning - annotations: - description: Ceph exporter is not collecting metrics or is not available for past 10 minutes - title: Ceph exporter is not collecting metrics or is not available - - alert: prom_exporter_openstack_unavailable - expr: absent(openstack_exporter_cache_refresh_duration_seconds) - for: 10m - labels: - severity: warning - annotations: - description: Openstack exporter is not collecting metrics or is not available for past 10 minutes - title: Openstack exporter is not collecting metrics or is not available - - alert: prom_exporter_mariadb_unavailable - expr: absent(mysql_up) - for: 10m - labels: - severity: warning - annotations: - description: MariaDB exporter is not collecting metrics or is not available for past 10 minutes - title: MariaDB exporter is not collecting metrics or is not available - - alert: prom_exporter_kube_state_metrics_unavailable - expr: absent(kube_node_info) - for: 10m - labels: - severity: warning - annotations: - description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes - title: kube-state-metrics exporter is not collecting metrics or is not available - - alert: prom_exporter_postgresql_unavailable - expr: absent(pg_static) - for: 10m - labels: - severity: warning - annotations: - description: postgresql exporter is not collecting metrics or is not available for past 10 minutes - title: postgresql exporter is not collecting metrics or is not available - - alert: prom_exporter_node_unavailable - expr: absent(node_uname_info) - for: 10m - 
labels: - severity: warning - annotations: - description: node exporter is not collecting metrics or is not available for past 10 minutes - title: node exporter is not collecting metrics or is not available - - alert: prom_exporter_calico_unavailable - expr: absent(felix_host) - for: 10m - labels: - severity: warning - annotations: - description: Calico exporter is not collecting metrics or is not available for past 10 minutes - title: Calico exporter is not collecting metrics or is not available - - alert: prom_exporter_elasticsearch_unavailable - expr: absent(elasticsearch_cluster_health_status) - for: 10m - labels: - severity: warning - annotations: - description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes - title: Elasticsearch exporter is not collecting metrics or is not available - - alert: prom_exporter_fluentd_unavailable - expr: absent(fluentd_up) - for: 10m - labels: - severity: warning - annotations: - description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes - title: Fluentd exporter is not collecting metrics or is not available + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: calico-etcd + kubernetes_sd_configs: + - role: service + scrape_interval: 20s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: keep + source_labels: + - __meta_kubernetes_service_name + regex: "calico-etcd" + - action: keep + source_labels: + - __meta_kubernetes_namespace + regex: kube-system + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - target_label: endpoint + replacement: "calico-etcd" + - job_name: ceph-mgr + kubernetes_sd_configs: + - role: service + scrape_interval: 20s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: keep + source_labels: + - __meta_kubernetes_service_name + regex: "ceph-mgr" + - source_labels: + - __meta_kubernetes_service_port_name + action: drop + regex: 'ceph-mgr' + - action: keep + source_labels: + - __meta_kubernetes_namespace + regex: ceph + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - target_label: endpoint + replacement: "ceph-mgr" + alerting: + alertmanagers:
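+ # The alertmanager discovery below uses pod-role service discovery: it keeps
+ # only pods labelled application=alertmanager, delivers alerts to the port
+ # named alerts-api, and drops the peer-mesh port (the cluster gossip
+ # listener) so the mesh port is never treated as an alerting endpoint.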
+ - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_application] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: alerts-api + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: peer-mesh + action: drop + rules: [] diff --git a/prometheus/values_overrides/alertmanager.yaml b/prometheus/values_overrides/alertmanager.yaml new file mode 100644 index 0000000000..8e6572e848 --- /dev/null +++ b/prometheus/values_overrides/alertmanager.yaml @@ -0,0 +1,31 @@ +conf: + prometheus: + rules: + alertmanager: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
+ summary: Alertmanager configuration reload has failed diff --git a/prometheus/values_overrides/ceph.yaml b/prometheus/values_overrides/ceph.yaml new file mode 100644 index 0000000000..91e8e98d7b --- /dev/null +++ b/prometheus/values_overrides/ceph.yaml @@ -0,0 +1,71 @@ +conf: + prometheus: + rules: + ceph: + groups: + - name: ceph.rules + rules: + - alert: prom_exporter_ceph_unavailable + expr: absent(ceph_health_status) + for: 10m + labels: + severity: warning + annotations: + description: Ceph exporter is not collecting metrics or is not available for past 10 minutes + title: Ceph exporter is not collecting metrics or is not available + - alert: no_active_ceph_mgr + expr: count(up{job="ceph-mgr"} == 1) == 0 + for: 5m + labels: + severity: warning + annotations: + description: 'no ceph active mgr is present or all ceph mgr are down' + summary: 'no ceph active mgr is present' + - alert: ceph_mon_quorum_low + expr: ceph_mon_quorum_count < 3 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' + summary: 'ceph high availability is at risk' + - alert: ceph_cluster_usage_high + expr: 100* ceph_cluster_total_used_bytes/ceph_cluster_total_bytes > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph cluster capacity usage more than 80 percent' + summary: 'ceph cluster usage is more than 80 percent' + - alert: ceph_placement_group_degrade_pct_high + expr: 100 * sum(ceph_pg_degraded)/sum(ceph_osd_numpg) > 80 + for: 5m + labels: + severity: critical + annotations: + description: 'ceph placement group degradation is more than 80 percent' + summary: 'ceph placement groups degraded' + - alert: ceph_osd_down_pct_high + expr: 100 * count(ceph_osd_up==0)/count(ceph_osd_metadata) > 80 + for: 5m + labels: + severity: critical + annotations: + description: 'ceph OSDs down percent is more than 80 percent' + summary: 'ceph OSDs down percent is high' + - alert: ceph_osd_down + expr: ceph_osd_up == 0 + for: 1m + labels: + severity: critical + annotations: + description: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' + summary: 'ceph OSD {{ $labels.ceph_daemon }} is down in instance {{ $labels.instance }}' + - alert: ceph_osd_out + expr: ceph_osd_in == 0 + for: 5m + labels: + severity: page + annotations: + description: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' + summary: 'ceph OSD {{ $labels.ceph_daemon }} is out in instance {{ $labels.instance }}' diff --git a/prometheus/values_overrides/kubernetes.yaml b/prometheus/values_overrides/kubernetes.yaml new file mode 100644 index 0000000000..dd15f1a3e3 --- /dev/null +++ b/prometheus/values_overrides/kubernetes.yaml @@ -0,0 +1,379 @@ +conf: + prometheus: + rules: + kubernetes: + groups: + - name: calico.rules + rules: + - alert: prom_exporter_calico_unavailable + expr: absent(felix_host) + for: 10m + labels: + severity: warning + annotations: + description: Calico exporter is not collecting metrics or is not available for past 10 minutes + title: Calico exporter is not collecting metrics or is not available + - alert: calico_datapane_failures_high_1h + expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour' + summary: 'A high number of dataplane failures within Felix are happening' + - alert:
calico_datapane_address_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size' + summary: 'Felix address message batch size is high' + - alert: calico_datapane_iface_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size' + summary: 'Felix interface message batch size is high' + - alert: calico_ipset_errors_high_1h + expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour' + summary: 'A high number of ipset errors within Felix are happening' + - alert: calico_iptable_save_errors_high_1h + expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour' + summary: 'A high number of iptable save errors within Felix are happening' + - alert: calico_iptable_restore_errors_high_1h + expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour' + summary: 'A high number of iptable restore errors within Felix are happening' + - name: etcd3.rules + rules: + - alert: etcd_InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down, the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: etcd_NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: etcd_HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_HighNumberOfFailedGRPCRequests + expr:
sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow + summary: slow gRPC requests + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow + summary: slow HTTP requests + - alert: etcd_EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: etcd_HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: etcd_HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fsync durations are high + summary: high fsync durations + - alert: etcd_HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready", status="unknown"} == 1 or kube_node_status_condition{condition="Ready", status="false"} == 1 + for: 1m + labels: + severity: critical + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than a minute + summary:
'{{ $labels.node }} Node status is NotReady and {{ $labels.status }}' + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) / count(kube_node_status_condition{condition="Ready", status="unknown"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10%) are in the NotReady state.' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 1 and (count(kube_node_status_condition{condition="Ready", status="false"} == 1) / count(kube_node_status_condition{condition="Ready", status="false"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10%) are in the NotReady state.' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SNodesNotReady + expr: count(kube_node_status_condition{condition="Ready", status="false"} == 1) > 0 or count(kube_node_status_condition{condition="Ready", status="unknown"} == 1) > 0 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} nodes are in NotReady state.' + summary: One or more Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + for: 1m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 + summary: Kubelet is close to pod limit + - name: kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager-discovery"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + - name: kubernetes-object.rules + rules: + - alert: prom_exporter_kube_state_metrics_unavailable + expr: absent(kube_node_info) + for: 10m + labels: + severity: warning + annotations: + description: kube-state-metrics exporter is not collecting metrics or is not available for past 10 minutes + title: kube-state-metrics exporter is not collecting metrics or is not available + - alert: kube_statefulset_replicas_unavailable + expr: kube_statefulset_status_replicas < kube_statefulset_replicas + for: 5m + labels: + severity: page + annotations: + description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' + summary: '{{$labels.statefulset}}: has insufficient replicas.' + - alert: daemonsets_misscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' + summary: 'Daemonsets not scheduled correctly' + - alert: daemonsets_not_scheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} has {{ $value }} fewer pods scheduled than desired' + summary: 'Less than desired number of daemonsets scheduled' + - alert: daemonset_pods_unavailable + expr: kube_daemonset_status_number_unavailable > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} currently has pods unavailable' + summary: 'Daemonset pods unavailable' + - alert: deployment_replicas_unavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' + summary: '{{$labels.deployment}}: has insufficient replicas.' + - alert: rollingupdate_deployment_replica_less_than_spec_max_unavailable + expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas available, which is less than the max unavailable specified for a rolling update' + summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
+ - alert: job_status_failed + expr: kube_job_status_failed > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Job {{$labels.exported_job}} is in failed status' + summary: '{{$labels.exported_job}} has failed status' + - alert: pod_status_pending + expr: kube_pod_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' + - alert: pod_error_image_pull + expr: kube_pod_container_status_waiting_reason{reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: pod_status_error_image_pull_backoff + expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an ImagePullBackOff error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: pod_error_crash_loop_back_off + expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CrashLoopBackOff error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: pod_error_config_error + expr: kube_pod_container_status_waiting_reason{reason="CreateContainerConfigError"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a CreateContainerConfigError error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: replicaset_missing_replicas + expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' + summary: 'Replicaset {{$labels.replicaset}} is missing replicas' + - alert: pod_container_terminated + expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: volume_claim_capacity_high_utilization + expr: 100 * kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 80 + for: 5m + labels: + severity: page + annotations: + description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity' + summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
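Every values_overrides file introduced by this change has the same shape: a single key under conf.prometheus.rules holding ordinary Prometheus rule groups, which the chart is expected to render into its own rules file. A minimal sketch of a site-local override in the same layout (the custom key, alert name, and threshold below are illustrative assumptions, not part of this change):

conf:
  prometheus:
    rules:
      custom:
        groups:
        - name: custom.rules
          rules:
          - alert: pod_high_restart_rate  # hypothetical example alert
            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
            for: 10m
            labels:
              severity: warning
            annotations:
              description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} restarted more than 5 times in the last hour'
              summary: 'Pod {{$labels.pod}} is restarting frequently'

A file like this would be supplied at deploy time as an additional --values argument to helm, alongside the overrides introduced here.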
diff --git a/prometheus/values_overrides/logging.yaml b/prometheus/values_overrides/logging.yaml new file mode 100644 index 0000000000..91151ca825 --- /dev/null +++ b/prometheus/values_overrides/logging.yaml @@ -0,0 +1,97 @@ +conf: + prometheus: + rules: + logging: + groups: + - name: fluentd.rules + rules: + - alert: prom_exporter_fluentd_unavailable + expr: absent(fluentd_up) + for: 10m + labels: + severity: warning + annotations: + description: Fluentd exporter is not collecting metrics or is not available for past 10 minutes + title: Fluentd exporter is not collecting metrics or is not available + - alert: fluentd_not_running + expr: fluentd_up == 0 + for: 5m + labels: + severity: page + annotations: + description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes' + summary: 'Fluentd is down' + - name: elasticsearch.rules + rules: + - alert: prom_exporter_elasticsearch_unavailable + expr: absent(elasticsearch_cluster_health_status) + for: 10m + labels: + severity: warning + annotations: + description: Elasticsearch exporter is not collecting metrics or is not available for past 10 minutes + title: Elasticsearch exporter is not collecting metrics or is not available + - alert: es_high_process_open_files_count + expr: sum(elasticsearch_process_open_files_count) by (host) > 64000 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.host }} has more than 64000 process open file count.' + summary: 'Elasticsearch has a very high process open file count.' + - alert: es_high_process_cpu_percent + expr: elasticsearch_process_cpu_percent > 95 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.' + summary: 'Elasticsearch process cpu usage is more than 95 percent.' + - alert: es_fs_usage_high + expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.' + summary: 'Elasticsearch filesystem usage is high.' + - alert: es_unassigned_shards + expr: elasticsearch_cluster_health_unassigned_shards > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch has {{ $value }} unassigned shards.' + summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.' + - alert: es_cluster_health_timed_out + expr: elasticsearch_cluster_health_timed_out > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status call timed out {{ $value }} times.' + summary: 'Elasticsearch cluster health status calls are timing out.' + - alert: es_cluster_health_status_alert + expr: (sum(elasticsearch_cluster_health_status{color="green"})*2)+sum(elasticsearch_cluster_health_status{color="yellow"}) < 2 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status is {{ $value }}, not 2 (green). One or more shards or replicas are unallocated.' + summary: 'Elasticsearch cluster health status is not green.'
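+ # Scoring note for es_cluster_health_status_alert above: green health
+ # contributes 2 and yellow contributes 1, so anything short of a fully
+ # green cluster drops the sum below 2 and fires the alert.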
+ - alert: es_cluster_health_too_few_nodes_running + expr: elasticsearch_cluster_health_number_of_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch nodes running' + summary: 'ElasticSearch running on less than 3 nodes' + - alert: es_cluster_health_too_few_data_nodes_running + expr: elasticsearch_cluster_health_number_of_data_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' + summary: 'ElasticSearch running on less than 3 data nodes' diff --git a/prometheus/values_overrides/nodes.yaml b/prometheus/values_overrides/nodes.yaml new file mode 100644 index 0000000000..dbde760755 --- /dev/null +++ b/prometheus/values_overrides/nodes.yaml @@ -0,0 +1,240 @@ +conf: + prometheus: + rules: + nodes: + groups: + - name: nodes.rules + rules: + - alert: prom_exporter_node_unavailable + expr: absent(node_uname_info) + for: 10m + labels: + severity: warning + annotations: + description: node exporter is not collecting metrics or is not available for past 10 minutes + title: node exporter is not collecting metrics or is not available + - alert: node_filesystem_full_80percent + expr: sort(node_filesystem_free{fstype =~ "xfs|ext[34]"} < node_filesystem_size{fstype =~ "xfs|ext[34]"} + * 0.2) / 1024 ^ 3 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + has less than 20% space left on its filesystem.' + summary: '{{$labels.alias}}: Filesystem is running out of space soon.' + - alert: node_filesystem_full_in_4h + expr: predict_linear(node_filesystem_free{fstype =~ "xfs|ext[34]"}[1h], 4 * 3600) <= 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + is running out of space in approx. 4 hours' + summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.' + - alert: node_filedescriptors_full_in_3h + expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum + for: 20m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running out of available file descriptors + in approx. 3 hours' + summary: '{{$labels.alias}} is running out of available file descriptors in + 3 hours.' + - alert: node_load1_90percent + expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running with > 90% total load for at least + 1h.' + summary: '{{$labels.alias}}: Running on high load.' + - alert: node_cpu_util_90percent + expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} has total CPU utilization over 90% for at least + 1h.' + summary: '{{$labels.alias}}: High CPU utilization.'
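+ # node_cpu_util_90percent above derives utilisation as 100 minus the
+ # per-alias average idle rate over 5m; the load alerts in this group
+ # normalise node_load by the number of CPUs counted per alias.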
+ - alert: node_ram_using_90percent + expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal + * 0.1 + for: 30m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using at least 90% of its RAM for at least + 30 minutes now.' + summary: '{{$labels.alias}}: Using lots of RAM.' + - alert: node_swap_using_80percent + expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) + > node_memory_SwapTotal * 0.8 + for: 10m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using 80% of its swap space for at least + 10 minutes now.' + summary: '{{$labels.alias}}: Running out of swap soon.' + - alert: node_high_cpu_load + expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) > 1 + for: 5m + labels: + severity: warning + annotations: + description: '{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}' + summary: '{{$labels.alias}}: Running on high load: {{$value}}' + - alert: node_high_memory_load + expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85 + for: 1m + labels: + severity: warning + annotations: + description: Host memory usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server memory is almost full + - alert: node_high_storage_load + expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) + / node_filesystem_size{mountpoint="/"} * 100 > 85 + for: 30s + labels: + severity: warning + annotations: + description: Host storage usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server storage is almost full + - alert: node_high_swap + expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal + * 0.4) + for: 1m + labels: + severity: warning + annotations: + description: Host system has a high swap usage of {{ humanize $value }}. Reported + by instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server has a high swap usage + - alert: node_high_network_drop_rcv + expr: node_network_receive_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network reception ({{ + humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high receive drop + - alert: node_high_network_drop_send + expr: node_network_transmit_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network transmission ({{ + humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high transmit drop + - alert: node_high_network_errs_rcv + expr: node_network_receive_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network reception + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high reception errors
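+ # NOTE: the node_network_* series above are cumulative counters, so these thresholds compare totals since boot; a rate-based variant such as rate(node_network_receive_errs{device!="lo"}[5m]) > 10 (threshold illustrative) may better capture sudden error bursts.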
+ - alert: node_high_network_errs_send + expr: node_network_transmit_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network transmission + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high transmission errors + - alert: node_network_conntrack_usage_80percent + expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8) + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of maximum limit' + summary: '{{$labels.instance}}: available network conntrack entries are low.' + - alert: node_entropy_available_low + expr: node_entropy_available_bits < 300 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has available entropy bits of {{ $value }}, which is less than the required 300' + summary: '{{$labels.instance}}: is low on entropy bits.' + - alert: node_hwmon_high_cpu_temp + expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}' + summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}' + - alert: node_vmstat_paging_rate_high + expr: irate(node_vmstat_pgpgin[5m]) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has a high memory paging rate: {{$value}}' + summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}' + - alert: node_xfs_block_allocation_high + expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}' + - alert: node_network_bond_slaves_down + expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 + for: 5m + labels: + severity: page + annotations: + description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
+ summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)' + - alert: node_numa_memory_used + expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}' + summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}' + - alert: node_ntp_clock_skew_high + expr: abs(node_ntp_drift_seconds) > 2 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}' + summary: '{{$labels.alias}}: time is skewed by {{$value}} seconds' + - alert: node_disk_read_latency + expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 40 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high read latency of {{ $value }} ms' + summary: 'High read latency observed for device {{ $labels.device }}' + - alert: node_disk_write_latency + expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 40 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high write latency of {{ $value }} ms' + summary: 'High write latency observed for device {{ $labels.device }}' diff --git a/prometheus/values_overrides/openstack.yaml b/prometheus/values_overrides/openstack.yaml new file mode 100644 index 0000000000..4c38a6a5d5 --- /dev/null +++ b/prometheus/values_overrides/openstack.yaml @@ -0,0 +1,315 @@ +conf: + prometheus: + rules: + openstack: + groups: + - name: mariadb.rules + rules: + - alert: prom_exporter_mariadb_unavailable + expr: absent(mysql_up) + for: 10m + labels: + severity: warning + annotations: + description: MariaDB exporter is not collecting metrics or is not available for the past 10 minutes + title: MariaDB exporter is not collecting metrics or is not available + - alert: mariadb_table_lock_wait_high + expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 + for: 10m + labels: + severity: warning + annotations: + description: 'MariaDB has a high table lock wait percentage of {{ $value }}' + summary: 'MariaDB table lock waits are high' + - alert: mariadb_node_not_ready + expr: mysql_global_status_wsrep_ready != 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
+ summary: 'Galera cluster node not ready' + - alert: mariadb_galera_node_out_of_sync + expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)' + summary: 'Galera cluster node out of sync' + - alert: mariadb_innodb_replication_fallen_behind + expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) + for: 10m + labels: + severity: warning + annotations: + description: 'MySQL InnoDB replication has fallen behind and is not recovering' + summary: 'MySQL InnoDB replication is lagging' + - name: openstack.rules + rules: + - alert: prom_exporter_openstack_unavailable + expr: absent(openstack_exporter_cache_refresh_duration_seconds) + for: 10m + labels: + severity: warning + annotations: + description: Openstack exporter is not collecting metrics or is not available for the past 10 minutes + title: Openstack exporter is not collecting metrics or is not available + - alert: os_glance_api_availability + expr: openstack_check_glance_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Glance API is not available at {{$labels.url}}' + - alert: os_nova_api_availability + expr: openstack_check_nova_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Nova API is not available at {{$labels.url}}' + - alert: os_keystone_api_availability + expr: openstack_check_keystone_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Keystone API is not available at {{$labels.url}}' + - alert: os_neutron_api_availability + expr: openstack_check_neutron_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Neutron API is not available at {{$labels.url}}' + - alert: os_neutron_metadata_agent_availability + expr: openstack_services_neutron_metadata_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron metadata agents are not available for more than 5 minutes' + summary: 'One or more neutron metadata agents are not available' + - alert: os_neutron_openvswitch_agent_availability + expr: openstack_services_neutron_openvswitch_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron openvswitch agents are not available for more than 5 minutes' + summary: 'One or more neutron openvswitch agents are not available' + - alert: os_neutron_dhcp_agent_availability + expr: openstack_services_neutron_dhcp_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron dhcp agents are not available for more than 5 minutes' + summary: 'One or more neutron dhcp agents are not available' + - alert: os_neutron_l3_agent_availability + expr: openstack_services_neutron_l3_agent_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'One or more neutron L3 agents are not available for more than 5 minutes' + summary: 'One or more neutron L3 agents are not
available' + - alert: os_swift_api_availability + expr: openstack_check_swift_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Swift API is not available at {{$labels.url}}' + - alert: os_cinder_api_availability + expr: openstack_check_cinder_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Cinder API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Cinder API is not available at {{$labels.url}}' + - alert: os_cinder_scheduler_availability + expr: openstack_services_cinder_cinder_scheduler != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Cinder scheduler is not available for more than 5 minutes' + summary: 'Cinder scheduler is not available' + - alert: os_heat_api_availability + expr: openstack_check_heat_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Heat API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Heat API is not available at {{$labels.url}}' + - alert: os_nova_compute_disabled + expr: openstack_services_nova_compute_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is disabled on some hosts' + - alert: os_nova_conductor_disabled + expr: openstack_services_nova_conductor_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is disabled on some hosts' + - alert: os_nova_consoleauth_disabled + expr: openstack_services_nova_consoleauth_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' + - alert: os_nova_scheduler_disabled + expr: openstack_services_nova_scheduler_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler is disabled on some hosts' + - alert: os_nova_compute_down + expr: openstack_services_nova_compute_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is down on some hosts' + - alert: os_nova_conductor_down + expr: openstack_services_nova_conductor_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is down on some hosts' + - alert: os_nova_consoleauth_down + expr: openstack_services_nova_consoleauth_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is down on some hosts' + - alert: os_nova_scheduler_down + expr: openstack_services_nova_scheduler_down_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is down on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler is 
down on some hosts' + - alert: os_vm_vcpu_usage_high + expr: openstack_total_used_vcpus * 100/(openstack_total_used_vcpus + openstack_total_free_vcpus) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM vcpu usage is high at {{$value}} percent' + summary: 'Openstack VM vcpu usage is high' + - alert: os_vm_ram_usage_high + expr: openstack_total_used_ram_MB * 100/(openstack_total_used_ram_MB + openstack_total_free_ram_MB) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM RAM usage is high at {{$value}} percent' + summary: 'Openstack VM RAM usage is high' + - alert: os_vm_disk_usage_high + expr: openstack_total_used_disk_GB * 100/ ( openstack_total_used_disk_GB + openstack_total_free_disk_GB ) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'Openstack VM Disk usage is high at {{$value}} percent' + summary: 'Openstack VM Disk usage is high' + - name: rabbitmq.rules + rules: + - alert: rabbitmq_network_partitions_detected + expr: min(partitions) by(instance) > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions' + summary: 'RabbitMQ network partitions detected' + - alert: rabbitmq_down + expr: min(rabbitmq_up) by(instance) != 1 + for: 10m + labels: + severity: page + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} is down' + summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 minutes' + - alert: rabbitmq_file_descriptor_usage_high + expr: fd_used * 100 /fd_total > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.' + summary: 'RabbitMQ file descriptor usage has been high for the last 10 minutes' + - alert: rabbitmq_node_disk_free_alarm + expr: node_disk_free_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.' + summary: 'RabbitMQ disk space usage is high' + - alert: rabbitmq_node_memory_alarm + expr: node_mem_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.' + summary: 'RabbitMQ memory usage is high' + - alert: rabbitmq_less_than_3_nodes + expr: running < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server has less than 3 nodes running.' + summary: 'RabbitMQ server is at risk of losing data' + - alert: rabbitmq_queue_messages_returned_high + expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server is returning more than 50 percent of messages received.' + summary: 'RabbitMQ server is returning more than 50 percent of messages received.' + - alert: rabbitmq_consumers_low_utilization + expr: queue_consumer_utilisation < 0.4 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ consumers are consuming messages slowly' + summary: 'RabbitMQ consumer message consumption speed is low' + - alert: rabbitmq_high_message_load + expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ has a high message load:
total queue depth > 17000 or growth of more than 4000 messages over 5 minutes.' + summary: 'RabbitMQ has high message load' diff --git a/prometheus/values_overrides/postgresql.yaml b/prometheus/values_overrides/postgresql.yaml new file mode 100644 index 0000000000..9e83ee92af --- /dev/null +++ b/prometheus/values_overrides/postgresql.yaml @@ -0,0 +1,39 @@ +conf: + prometheus: + rules: + postgresql: + groups: + - name: postgresql.rules + rules: + - alert: prom_exporter_postgresql_unavailable + expr: absent(pg_static) + for: 10m + labels: + severity: warning + annotations: + description: postgresql exporter is not collecting metrics or is not available for the past 10 minutes + title: postgresql exporter is not collecting metrics or is not available + - alert: pg_replication_fallen_behind + expr: (pg_replication_lag > 120) and ON(instance) (pg_replication_is_replica == 1) + for: 5m + labels: + severity: warning + annotations: + description: Replication lag on server {{$labels.instance}} is currently {{$value | humanizeDuration }} + title: Postgres Replication lag is over 2 minutes + - alert: pg_connections_too_high + expr: sum(pg_stat_activity_count) BY (environment, fqdn) > ON(fqdn) pg_settings_max_connections * 0.95 + for: 5m + labels: + severity: warn + channel: database + annotations: + title: Postgresql has {{$value}} connections on {{$labels.fqdn}}, which is close to the maximum + - alert: pg_deadlocks_detected + expr: sum by(datname) (rate(pg_stat_database_deadlocks[1m])) > 0 + for: 5m + labels: + severity: warn + annotations: + description: postgresql at {{$labels.instance}} is showing a {{$value}} rate of deadlocks for database {{$labels.datname}} + title: Postgres server is experiencing deadlocks diff --git a/tools/deployment/federated-monitoring/000-install-packages.sh b/tools/deployment/federated-monitoring/000-install-packages.sh new file mode 120000 index 0000000000..d702c48993 --- /dev/null +++ b/tools/deployment/federated-monitoring/000-install-packages.sh @@ -0,0 +1 @@ +../common/000-install-packages.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/005-deploy-k8s.sh b/tools/deployment/federated-monitoring/005-deploy-k8s.sh new file mode 120000 index 0000000000..257a39f7a3 --- /dev/null +++ b/tools/deployment/federated-monitoring/005-deploy-k8s.sh @@ -0,0 +1 @@ +../common/005-deploy-k8s.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/010-ingress.sh b/tools/deployment/federated-monitoring/010-ingress.sh new file mode 120000 index 0000000000..94b1e92f92 --- /dev/null +++ b/tools/deployment/federated-monitoring/010-ingress.sh @@ -0,0 +1 @@ +../common/020-ingress.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/020-nfs-provisioner.sh b/tools/deployment/federated-monitoring/020-nfs-provisioner.sh new file mode 120000 index 0000000000..2d0231b7fb --- /dev/null +++ b/tools/deployment/federated-monitoring/020-nfs-provisioner.sh @@ -0,0 +1 @@ +../osh-infra-monitoring/030-nfs-provisioner.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/030-ldap.sh b/tools/deployment/federated-monitoring/030-ldap.sh new file mode 120000 index 0000000000..4ed4b9d4b4 --- /dev/null +++ b/tools/deployment/federated-monitoring/030-ldap.sh @@ -0,0 +1 @@ +../common/040-ldap.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/040-kube-state-metrics.sh b/tools/deployment/federated-monitoring/040-kube-state-metrics.sh new file mode 120000 index 0000000000..2a18ebb8b5 ---
/dev/null +++ b/tools/deployment/federated-monitoring/040-kube-state-metrics.sh @@ -0,0 +1 @@ +../common/070-kube-state-metrics.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/050-node-exporter.sh b/tools/deployment/federated-monitoring/050-node-exporter.sh new file mode 120000 index 0000000000..412748a74d --- /dev/null +++ b/tools/deployment/federated-monitoring/050-node-exporter.sh @@ -0,0 +1 @@ +../common/080-node-exporter.sh \ No newline at end of file diff --git a/tools/deployment/federated-monitoring/060-prometheus.sh b/tools/deployment/federated-monitoring/060-prometheus.sh new file mode 100755 index 0000000000..fd5ded9b26 --- /dev/null +++ b/tools/deployment/federated-monitoring/060-prometheus.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +set -xe + +#NOTE: Lint and package chart +make prometheus + +tee /tmp/prometheus-one.yaml << EOF +endpoints: + monitoring: + hosts: + default: prom-metrics-one + public: prometheus-one +manifests: + network_policy: false +EOF + +tee /tmp/prometheus-two.yaml << EOF +endpoints: + monitoring: + hosts: + default: prom-metrics-two + public: prometheus-two +manifests: + network_policy: false +EOF + +tee /tmp/prometheus-three.yaml << EOF +endpoints: + monitoring: + hosts: + default: prom-metrics-three + public: prometheus-three +manifests: + network_policy: false +EOF +#NOTE: Deploy command +for release in prometheus-one prometheus-two prometheus-three; do + rules_overrides="" + for rules_file in $(ls ./prometheus/values_overrides); do + rules_overrides="$rules_overrides --values=./prometheus/values_overrides/$rules_file" + done + helm upgrade --install $release ./prometheus \ + --namespace=osh-infra \ + --values=/tmp/$release.yaml \ + $rules_overrides + #NOTE: Wait for deploy + ./tools/deployment/common/wait-for-pods.sh osh-infra + + #NOTE: Validate Deployment info + helm status $release + + helm test $release +done diff --git a/tools/deployment/federated-monitoring/070-federated-prometheus.sh b/tools/deployment/federated-monitoring/070-federated-prometheus.sh new file mode 100755 index 0000000000..2eb600e727 --- /dev/null +++ b/tools/deployment/federated-monitoring/070-federated-prometheus.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2017 The Openstack-Helm Authors. + +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License.
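+ +#NOTE: Each Prometheus released by 060-prometheus.sh serves its series for federation at /federate; assuming the in-cluster service names used below resolve, the endpoint can be spot-checked manually, e.g.: +# curl -G 'http://prometheus-one.osh-infra.svc.cluster.local:80/federate' --data-urlencode 'match[]={__name__=~".+"}' | head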
+ +set -xe + +tee /tmp/federated-prometheus.yaml << EOF +endpoints: + monitoring: + hosts: + default: prom-metrics-federate + public: prometheus-federate +manifests: + network_policy: false +conf: + prometheus: + scrape_configs: + template: | + global: + scrape_interval: 60s + evaluation_interval: 60s + scrape_configs: + - job_name: 'federate' + scrape_interval: 15s + + honor_labels: true + metrics_path: '/federate' + + params: + 'match[]': + - '{__name__=~".+"}' + + static_configs: + - targets: + - 'prometheus-one.osh-infra.svc.cluster.local:80' + - 'prometheus-two.osh-infra.svc.cluster.local:80' + - 'prometheus-three.osh-infra.svc.cluster.local:80' +EOF + +#NOTE: Lint and package chart +make prometheus + +#NOTE: Deploy command +helm upgrade --install federated-prometheus ./prometheus \ + --namespace=osh-infra \ + --values=/tmp/federated-prometheus.yaml + +#NOTE: Wait for deploy +./tools/deployment/common/wait-for-pods.sh osh-infra + +#NOTE: Validate Deployment info +helm status federated-prometheus + +helm test federated-prometheus diff --git a/tools/deployment/federated-monitoring/100-prometheus-selenium.sh b/tools/deployment/federated-monitoring/100-prometheus-selenium.sh new file mode 100755 index 0000000000..545397f525 --- /dev/null +++ b/tools/deployment/federated-monitoring/100-prometheus-selenium.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -xe + +export CHROMEDRIVER="${CHROMEDRIVER:="/etc/selenium/chromedriver"}" +export ARTIFACTS_DIR="${ARTIFACTS_DIR:="/tmp/artifacts/"}" + +export PROMETHEUS_USER="admin" +export PROMETHEUS_PASSWORD="changeme" + +export PROMETHEUS_URI="prometheus-one.osh-infra.svc.cluster.local" +python3 tools/gate/selenium/prometheusSelenium.py +mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_One_Command_Line_Flags.png +mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_One_Dashboard.png +mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_One_Runtime_Info.png + +export PROMETHEUS_URI="prometheus-two.osh-infra.svc.cluster.local" +python3 tools/gate/selenium/prometheusSelenium.py +mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Two_Command_Line_Flags.png +mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Two_Dashboard.png +mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Two_Runtime_Info.png + +export PROMETHEUS_URI="prometheus-three.osh-infra.svc.cluster.local" +python3 tools/gate/selenium/prometheusSelenium.py +mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Three_Command_Line_Flags.png +mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Three_Dashboard.png +mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Three_Runtime_Info.png + +export PROMETHEUS_URI="prometheus-federate.osh-infra.svc.cluster.local" +python3 tools/gate/selenium/prometheusSelenium.py +mv ${ARTIFACTS_DIR}/Prometheus_Command_Line_Flags.png ${ARTIFACTS_DIR}/Prometheus_Federated_Command_Line_Flags.png +mv ${ARTIFACTS_DIR}/Prometheus_Dashboard.png ${ARTIFACTS_DIR}/Prometheus_Federated_Dashboard.png +mv ${ARTIFACTS_DIR}/Prometheus_Runtime_Info.png ${ARTIFACTS_DIR}/Prometheus_Federated_Runtime_Info.png diff --git a/tools/deployment/osh-infra-monitoring/050-prometheus.sh b/tools/deployment/osh-infra-monitoring/050-prometheus.sh index 4c2edb2ebc..9865c421c5 100755 --- a/tools/deployment/osh-infra-monitoring/050-prometheus.sh +++
b/tools/deployment/osh-infra-monitoring/050-prometheus.sh @@ -19,9 +19,15 @@ set -xe #NOTE: Lint and package chart make prometheus +rules_overrides="" +for rules_file in $(ls ./prometheus/values_overrides); do + rules_overrides="$rules_overrides --values=./prometheus/values_overrides/$rules_file" +done + #NOTE: Deploy command helm upgrade --install prometheus ./prometheus \ - --namespace=osh-infra + --namespace=osh-infra \ + $rules_overrides #NOTE: Wait for deploy ./tools/deployment/common/wait-for-pods.sh osh-infra diff --git a/zuul.d/jobs.yaml b/zuul.d/jobs.yaml index d317b9c688..415c6a1b94 100644 --- a/zuul.d/jobs.yaml +++ b/zuul.d/jobs.yaml @@ -169,6 +169,29 @@ - ./tools/deployment/osh-infra-monitoring/610-prometheus-selenium.sh || true - ./tools/deployment/osh-infra-monitoring/620-nagios-selenium.sh || true +- job: + name: openstack-helm-infra-federated-monitoring + parent: openstack-helm-infra-functional + timeout: 7200 + pre-run: + - playbooks/osh-infra-upgrade-host.yaml + - playbooks/osh-infra-deploy-selenium.yaml + run: playbooks/osh-infra-gate-runner.yaml + post-run: playbooks/osh-infra-collect-logs.yaml + nodeset: openstack-helm-single-node + vars: + gate_scripts: + - ./tools/deployment/federated-monitoring/000-install-packages.sh + - ./tools/deployment/federated-monitoring/005-deploy-k8s.sh + - ./tools/deployment/federated-monitoring/010-ingress.sh + - ./tools/deployment/federated-monitoring/020-nfs-provisioner.sh + - ./tools/deployment/federated-monitoring/030-ldap.sh + - ./tools/deployment/federated-monitoring/040-kube-state-metrics.sh + - ./tools/deployment/federated-monitoring/050-node-exporter.sh + - ./tools/deployment/federated-monitoring/060-prometheus.sh + - ./tools/deployment/federated-monitoring/070-federated-prometheus.sh + - ./tools/deployment/federated-monitoring/100-prometheus-selenium.sh || true + - job: name: openstack-helm-infra-aio-network-policy parent: openstack-helm-infra-functional diff --git a/zuul.d/project.yaml b/zuul.d/project.yaml index 2d76ace302..575f564755 100644 --- a/zuul.d/project.yaml +++ b/zuul.d/project.yaml @@ -21,6 +21,8 @@ - openstack-helm-lint - openstack-helm-infra-aio-logging - openstack-helm-infra-aio-monitoring + - openstack-helm-infra-federated-monitoring: + voting: false - openstack-helm-infra-aio-network-policy: voting: false - openstack-helm-infra-openstack-support