diff --git a/charts/cluster-addons/.helmignore b/charts/cluster-addons/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/cluster-addons/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/cluster-addons/Chart.yaml b/charts/cluster-addons/Chart.yaml new file mode 100644 index 0000000..a5d845c --- /dev/null +++ b/charts/cluster-addons/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: cluster-addons +description: Helm chart that deploys cluster addons for a CAPI cluster. +type: application +version: 0.1.0 +appVersion: main diff --git a/charts/cluster-addons/README.md b/charts/cluster-addons/README.md new file mode 100644 index 0000000..aee49f3 --- /dev/null +++ b/charts/cluster-addons/README.md @@ -0,0 +1,165 @@ +# cluster-addons chart + +This [Helm chart](https://helm.sh/) manages the deployment of addons for a +[Kubernetes](https://kubernetes.io) cluster deployed using +[Cluster API](https://cluster-api.sigs.k8s.io/). It is a dependency of the cluster management +charts from this repository, e.g. [openstack-cluster](../openstack-cluster). + +Addons are managed using custom resources provided by the +[Cluster API Addon Provider](https://github.com/stackhpc/cluster-api-addon-provider), +which must be installed. Please also read the documentation for the addon provider to +see how addons are defined. 
+ +## Contents + +- [Container Network Interface (CNI) plugins](#container-network-interface-cni-plugins) +- [OpenStack integrations](#openstack-integrations) +- [Ingress controllers](#ingress-controllers) +- [Metrics server](#metrics-server) +- [Monitoring and logging](#monitoring-and-logging) + +## Container Network Interface (CNI) plugins + +This chart can install either [Calico](https://docs.projectcalico.org/about/about-calico) or +[Cilium](https://cilium.io/) as a +[CNI plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/) +to provide the pod networking in a Kubernetes cluster. By default, the Calico CNI will be +installed. + +To switch the CNI to Cilium, use the following in your Helm values: + +```yaml +cni: + type: cilium +``` + +And to disable the installation of a CNI completely: + +```yaml +cni: + enabled: false +``` + +Additional configuration options are available for each - see [values.yaml](./values.yaml). + +## OpenStack integrations + +Kubernetes allows cloud providers to provide various plugins to integrate with the +underlying infrastructure, for example +[Cloud Controller Managers (CCMs)](https://kubernetes.io/docs/concepts/architecture/cloud-controller/), +[Container Storage Interface (CSI) implementations](https://kubernetes-csi.github.io/docs/) +and [authenticating webhooks](https://kubernetes.io/docs/reference/access-authn-authz/webhook/). + +This chart is able to deploy the CCM and the Cinder CSI plugin from the +[Kubernetes OpenStack cloud provider](https://github.com/kubernetes/cloud-provider-openstack), +which allows your Kubernetes cluster to integrate with the OpenStack cloud on which it is deployed. +This enables features like automatic labelling of nodes with OpenStack information (e.g. server ID +and flavor), automatic configuration of hostnames and IP addresses, managed load balancers for +services and dynamic provisioning of RWO volumes. 
+ +By default, the OpenStack integrations are not enabled. To enable OpenStack integrations on the +target cluster, use the following in your Helm values: + +```yaml +openstack: + enabled: true +``` + +To configure options for the `[Networking]`, `[LoadBalancer]`, `[BlockStorage]` and `[Metadata]` +sections of the cloud-config file, you can use Helm values, e.g.: + +```yaml +openstack: + cloudConfig: + Networking: + public-network-name: public-internet + LoadBalancer: + lb-method: LEAST_CONNECTIONS + create-monitor: true + BlockStorage: + ignore-volume-az: true + Metadata: + search-order: metadataService +``` + +The `[Globals]` section is populated such that the credential used by the `OpenStackCluster` +object is also used by the OpenStack integrations on the cluster. + +For the available options, consult the documentation for the +[CCM](https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/openstack-cloud-controller-manager/using-openstack-cloud-controller-manager.md#config-openstack-cloud-controller-manager) +and the +[Cinder CSI plugin](https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/cinder-csi-plugin/using-cinder-csi-plugin.md#block-storage). + +Additional configuration options are available for the OpenStack integrations - see +[values.yaml](./values.yaml) for more details. + +## Ingress controllers + +Running an +[Ingress Controller](https://kubernetes.io/docs/concepts/services-networking/ingress-controllers/) +on your Kubernetes cluster enables the use of +[Ingress resources](https://kubernetes.io/docs/concepts/services-networking/ingress/) +to manage HTTP(S) traffic flowing in and out of the cluster. This allows your web applications +to take advantage of load-balancing, name-based virtual hosting, path-based routing and +TLS termination using the same declarative approach as other Kubernetes resources. 
+When combined with a [cert-manager](https://cert-manager.io/) issuer, this provides an almost frictionless way +to secure your web services. + +It is possible to install multiple Ingress Controllers and select the preferred one for a +particular Ingress resource using +[Ingress Classes](https://kubernetes.io/docs/concepts/services-networking/ingress/#ingress-class). + +This chart can install the [Nginx Ingress Controller](https://kubernetes.github.io/ingress-nginx/) +onto the target cluster. + +The Nginx Ingress Controller is disabled by default. To enable it, use the following Helm values: + +```yaml +ingress: + enabled: true +``` + +## Metrics server + +In order to use features like `kubectl top` to observe resource usage, and also to use +[Horizontal Pod Autoscalers](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/), +the [metrics server](https://github.com/kubernetes-sigs/metrics-server) must be installed. + +This chart is able to install the metrics server, and it is enabled by default. To disable +it, use the following Helm values: + +```yaml +metricsServer: + enabled: false +``` + +## Monitoring and logging + +This chart is able to deploy a monitoring and logging stack using +[Prometheus](https://prometheus.io/), [Grafana](https://grafana.com/) and +[Loki](https://github.com/grafana/loki). + +The monitoring stack is installed using the +[kube-prometheus-stack chart](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack), +which makes sure many useful exporters are installed and dashboards available for them. +It also configures alerts for the cluster, *but does not configure any alert sinks by default*. + +Loki is installed using the +[loki-stack](https://github.com/grafana/helm-charts/tree/main/charts/loki-stack) chart, +which also installs and configures [promtail](https://grafana.com/docs/loki/latest/clients/promtail/) +to ship logs to Loki. 
A simple dashboard is installed into the Grafana provided by +`kube-prometheus-stack` to make the logs available for browsing. + +The monitoring stack is not enabled by default. To enable it, use the following Helm values: + +```yaml +monitoring: + enabled: true +``` + +By default, Grafana is only available from within the cluster and must be accessed using +[port forwarding](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/): + +```sh +kubectl -n monitoring-system port-forward svc/kube-prometheus-stack-grafana 3000:80 +``` diff --git a/charts/openstack-cluster/grafana-dashboards/loki-dashboard.json b/charts/cluster-addons/grafana-dashboards/loki-dashboard.json similarity index 100% rename from charts/openstack-cluster/grafana-dashboards/loki-dashboard.json rename to charts/cluster-addons/grafana-dashboards/loki-dashboard.json diff --git a/charts/openstack-cluster/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json b/charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json similarity index 100% rename from charts/openstack-cluster/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json rename to charts/cluster-addons/grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json diff --git a/charts/cluster-addons/templates/_helpers.tpl b/charts/cluster-addons/templates/_helpers.tpl new file mode 100644 index 0000000..db0a54b --- /dev/null +++ b/charts/cluster-addons/templates/_helpers.tpl @@ -0,0 +1,39 @@ +{{/* +The name of the target cluster. +*/}} +{{- define "cluster-addons.clusterName" -}} +{{- .Values.clusterName | default .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- end }} + +{{/* +Create a name for a cluster component. +*/}} +{{- define "cluster-addons.componentName" -}} +{{- $ctx := index . 0 -}} +{{- $componentName := index . 
1 -}} +{{- printf "%s-%s" $ctx.Release.Name $componentName | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "cluster-addons.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Selector labels for component-level resources +*/}} +{{- define "cluster-addons.componentSelectorLabels" -}} +capi.stackhpc.com/cluster: {{ index . 0 | include "cluster-addons.clusterName" }} +capi.stackhpc.com/component: {{ index . 1 }} +{{- end -}} + +{{/* +Labels for component-level resources +*/}} +{{- define "cluster-addons.componentLabels" -}} +helm.sh/chart: {{ index . 0 | include "cluster-addons.chart" }} +capi.stackhpc.com/managed-by: {{ (index . 0).Release.Service }} +{{ include "cluster-addons.componentSelectorLabels" . }} +{{- end -}} diff --git a/charts/cluster-addons/templates/cni/calico.yaml b/charts/cluster-addons/templates/cni/calico.yaml new file mode 100644 index 0000000..98a24af --- /dev/null +++ b/charts/cluster-addons/templates/cni/calico.yaml @@ -0,0 +1,43 @@ +{{- if and .Values.cni.enabled (eq .Values.cni.type "calico") }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "cni-calico") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + installation: + calicoNetwork: + bgp: Disabled + nodeAddressAutodetectionV4: + kubernetes: NodeInternalIP + ipPools: + {% for cidr in cluster.spec.clusterNetwork.pods.cidrBlocks %} + - cidr: {{ "{{" }} cidr {{ "}}" }} + encapsulation: VXLAN + {% endfor %} + overrides: | + {{- toYaml .Values.cni.calico.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . 
"cni-calico") }} + labels: {{ include "cluster-addons.componentLabels" (list . "cni-calico") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.cni.calico.chart | nindent 4 }} + targetNamespace: {{ .Values.cni.calico.release.namespace }} + releaseName: cni-calico + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . "cni-calico") }}-config + key: overrides +{{- end }} diff --git a/charts/cluster-addons/templates/cni/cilium.yaml b/charts/cluster-addons/templates/cni/cilium.yaml new file mode 100644 index 0000000..7747424 --- /dev/null +++ b/charts/cluster-addons/templates/cni/cilium.yaml @@ -0,0 +1,35 @@ +{{- if and .Values.cni.enabled (eq .Values.cni.type "cilium") }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "cni-cilium") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "cni-cilium") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + ipam: + mode: kubernetes + overrides: | + {{- toYaml .Values.cni.cilium.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "cni-cilium") }} + labels: {{ include "cluster-addons.componentLabels" (list . "cni-cilium") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.cni.cilium.chart | nindent 4 }} + targetNamespace: {{ .Values.cni.cilium.release.namespace }} + releaseName: cni-cilium + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "cni-cilium") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . 
"cni-cilium") }}-config + key: overrides +{{- end }} diff --git a/charts/cluster-addons/templates/ingress-nginx.yaml b/charts/cluster-addons/templates/ingress-nginx.yaml new file mode 100644 index 0000000..83d5238 --- /dev/null +++ b/charts/cluster-addons/templates/ingress-nginx.yaml @@ -0,0 +1,30 @@ +{{/* Install the Nginx ingress controller only when both the top-level ingress switch (the one documented in the README) and the nginx-specific switch are enabled, matching how the monitoring and openstack addons are gated. */}} +{{- if and .Values.ingress.enabled .Values.ingress.nginx.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "ingress-nginx") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + values: | + {{- toYaml .Values.ingress.nginx.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }} + labels: {{ include "cluster-addons.componentLabels" (list . "ingress-nginx") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.ingress.nginx.chart | nindent 4 }} + targetNamespace: {{ .Values.ingress.nginx.release.namespace }} + releaseName: ingress-nginx + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "ingress-nginx") }}-config + key: values +{{- end }} diff --git a/charts/cluster-addons/templates/kubernetes-dashboard.yaml b/charts/cluster-addons/templates/kubernetes-dashboard.yaml new file mode 100644 index 0000000..cb2f7e3 --- /dev/null +++ b/charts/cluster-addons/templates/kubernetes-dashboard.yaml @@ -0,0 +1,36 @@ +{{- if .Values.kubernetesDashboard.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "kubernetes-dashboard") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . 
"kubernetes-dashboard") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + # Enable the metrics scraper by default + defaults: | + metricsScraper: + enabled: true + overrides: | + {{- toYaml .Values.kubernetesDashboard.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "kubernetes-dashboard") }} + labels: {{ include "cluster-addons.componentLabels" (list . "kubernetes-dashboard") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.kubernetesDashboard.chart | nindent 4 }} + targetNamespace: {{ .Values.kubernetesDashboard.release.namespace }} + releaseName: kubernetes-dashboard + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "kubernetes-dashboard") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . "kubernetes-dashboard") }}-config + key: overrides +{{- end }} diff --git a/charts/openstack-cluster/templates/addons/mellanox-network-operator.yaml b/charts/cluster-addons/templates/mellanox-network-operator.yaml similarity index 52% rename from charts/openstack-cluster/templates/addons/mellanox-network-operator.yaml rename to charts/cluster-addons/templates/mellanox-network-operator.yaml index fdc7c7a..d338e8f 100644 --- a/charts/openstack-cluster/templates/addons/mellanox-network-operator.yaml +++ b/charts/cluster-addons/templates/mellanox-network-operator.yaml @@ -1,12 +1,12 @@ -{{- if and .Values.addons.enabled .Values.addons.mellanoxNetworkOperator.enabled }} +{{- if .Values.mellanoxNetworkOperator.enabled }} --- apiVersion: v1 kind: Secret metadata: - name: {{ include "openstack-cluster.componentName" (list . "mellanox-network-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . 
"mellanox-network-operator") }}-config labels: - {{- include "openstack-cluster.componentLabels" (list . "mellanox-network-operator") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" + {{- include "cluster-addons.componentLabels" (list . "mellanox-network-operator") | nindent 4 }} + addons.stackhpc.com/watch: "" stringData: defaults: | # Use the shared NFD @@ -35,24 +35,24 @@ stringData: secondaryNetwork: deploy: false overrides: | - {{- toYaml .Values.addons.mellanoxNetworkOperator.release.values | nindent 4 }} + {{- toYaml .Values.mellanoxNetworkOperator.release.values | nindent 4 }} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: HelmRelease metadata: - name: {{ include "openstack-cluster.componentName" (list . "mellanox-network-operator") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "mellanox-network-operator") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "mellanox-network-operator") }} + labels: {{ include "cluster-addons.componentLabels" (list . "mellanox-network-operator") | nindent 4 }} spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} + clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true - chart: {{ toYaml .Values.addons.mellanoxNetworkOperator.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.mellanoxNetworkOperator.release.namespace }} + chart: {{ toYaml .Values.mellanoxNetworkOperator.chart | nindent 4 }} + targetNamespace: {{ .Values.mellanoxNetworkOperator.release.namespace }} releaseName: mellanox-network-operator valuesSources: - secret: - name: {{ include "openstack-cluster.componentName" (list . "mellanox-network-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . "mellanox-network-operator") }}-config key: defaults - secret: - name: {{ include "openstack-cluster.componentName" (list . "mellanox-network-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . 
"mellanox-network-operator") }}-config key: overrides {{- end }} diff --git a/charts/cluster-addons/templates/metrics-server.yaml b/charts/cluster-addons/templates/metrics-server.yaml new file mode 100644 index 0000000..7c781b1 --- /dev/null +++ b/charts/cluster-addons/templates/metrics-server.yaml @@ -0,0 +1,35 @@ +{{- if .Values.metricsServer.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "metrics-server") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "metrics-server") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + args: + - --kubelet-insecure-tls + overrides: | + {{- toYaml .Values.metricsServer.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "metrics-server") }} + labels: {{ include "cluster-addons.componentLabels" (list . "metrics-server") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.metricsServer.chart | nindent 4 }} + targetNamespace: {{ .Values.metricsServer.release.namespace }} + releaseName: metrics-server + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "metrics-server") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . "metrics-server") }}-config + key: overrides +{{- end }} diff --git a/charts/cluster-addons/templates/migrate-job.yaml b/charts/cluster-addons/templates/migrate-job.yaml new file mode 100644 index 0000000..8844388 --- /dev/null +++ b/charts/cluster-addons/templates/migrate-job.yaml @@ -0,0 +1,139 @@ +{{/* + Job that cleans up artifacts from the previous job-based addon installation + in preparation for creating addon objects. + + We only produce the job if jobs from a previous installation exist. 
+ Jobs that carry no labels at all are treated as not matching (an empty dict is indexed instead of nil, which would abort rendering). */}} +{{- $clusterName := include "cluster-addons.clusterName" . }} +{{- $exists := false }} +{{- range $job := (lookup "batch/v1" "Job" .Release.Namespace "").items }} + {{- + $exists = or + $exists + (and + (index ($job.metadata.labels | default dict) "app.kubernetes.io/name" | default "" | eq "addons") + (index ($job.metadata.labels | default dict) "app.kubernetes.io/instance" | default "" | eq $clusterName) + ) + }} +{{- end }} +{{- if $exists }} +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "cluster-addons.componentName" (list . "addons-migrate") }} + labels: {{ include "cluster-addons.componentLabels" (list . "addons-migrate") | nindent 4 }} + annotations: + helm.sh/hook: pre-upgrade + helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded +spec: + backoffLimit: {{ .Values.hooks.backoffLimit }} + activeDeadlineSeconds: {{ .Values.hooks.activeDeadlineSeconds }} + template: + metadata: + labels: {{ include "cluster-addons.componentSelectorLabels" (list . "addons-migrate") | nindent 8 }} + spec: + {{- with .Values.hooks.imagePullSecrets }} + imagePullSecrets: {{ toYaml . 
| nindent 8 }} + {{- end }} + securityContext: {{ toYaml .Values.hooks.podSecurityContext | nindent 8 }} + restartPolicy: OnFailure + containers: + - name: addons-migrate + image: {{ + printf "%s:%s" + .Values.hooks.image.repository + (default .Chart.AppVersion .Values.hooks.image.tag) + }} + imagePullPolicy: {{ .Values.hooks.image.pullPolicy }} + securityContext: {{ toYaml .Values.hooks.securityContext | nindent 12 }} + args: + - /bin/bash + - -c + - | + set -ex + test -f "$KUBECONFIG" || exit 0 + kubectl version || exit 0 + + # Remove all the old kustomize releases where possible + helm status -n kustomize-releases ccm-openstack && \ + helm delete -n kustomize-releases ccm-openstack + helm status -n kustomize-releases metrics-server && \ + helm delete -n kustomize-releases metrics-server + + # The csi-cinder kustomize release contains the Cinder storage class, which we cannot delete + # if there are volumes associated with it + # Instead, we move the release to the new namespace, move the storage class into a separate + # release and annotate the storage class so that it doesn't get removed by the Helm upgrade + if helm status -n kustomize-releases csi-cinder; then + helm-move csi-cinder kustomize-releases {{ .Values.openstack.targetNamespace }} + helm-adopt \ + csi-cinder-storageclass \ + {{ .Values.openstack.targetNamespace }} \ + storageclass/{{ .Values.openstack.csiCinder.storageClass.name }} + kubectl annotate \ + storageclass/{{ .Values.openstack.csiCinder.storageClass.name }} \ + "helm.sh/resource-policy=keep" + fi + + # Adopt resources previously created in post-install scripts into the relevant Helm releases + helm-adopt \ + cni-calico \ + {{ .Values.cni.calico.release.namespace }} \ + installation/default + helm-adopt \ + kube-prometheus-stack-dashboards \ + {{ .Values.monitoring.kubePrometheusStack.release.namespace }} \ + configmap/nvidia-dcgm-exporter-dashboard \ + --namespace {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + 
helm-adopt \ + loki-stack-dashboards \ + {{ .Values.monitoring.kubePrometheusStack.release.namespace }} \ + configmap/loki-stack-grafana-datasource \ + --namespace {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + helm-adopt \ + loki-stack-dashboards \ + {{ .Values.monitoring.kubePrometheusStack.release.namespace }} \ + configmap/loki-stack-grafana-dashboard \ + --namespace {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + + # With the version bump to 40.x, kube-prometheus-stack picks up prometheus-node-exporter 4.x + # This changes the selector labels on the daemonset, which is an immutable field, so we remove + # the daemonset with the old labels before upgrading + # https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#from-39x-to-40x + kubectl delete daemonset \ + -l release=kube-prometheus-stack,app=prometheus-node-exporter \ + -n {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + + # With the version bump from 2.6.3 to 2.6.4, loki-stack picks up an updated promtail that + # changes the selector labels on the daemonset, which is an immutable field + # So we remove the daemonset with the old labels before upgrading + kubectl delete daemonset \ + -l release=loki-stack,app=promtail \ + -n {{ .Values.monitoring.lokiStack.release.namespace }} + env: + - name: KUBECONFIG + value: /etc/kubernetes/config + resources: {{ toYaml .Values.hooks.resources | nindent 12 }} + volumeMounts: + - name: etc-kubernetes + mountPath: /etc/kubernetes + readOnly: true + hostNetwork: {{ .Values.hooks.hostNetwork }} + {{- with .Values.hooks.nodeSelector }} + nodeSelector: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.hooks.affinity }} + affinity: {{ toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.hooks.tolerations }} + tolerations: {{ toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: etc-kubernetes + secret: + secretName: {{ include "cluster-addons.componentName" (list . 
"kubeconfig") }} + optional: true + items: + - key: value + path: config +{{- end }} diff --git a/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml new file mode 100644 index 0000000..e48eff8 --- /dev/null +++ b/charts/cluster-addons/templates/monitoring/kube-prometheus-stack.yaml @@ -0,0 +1,51 @@ +{{- if .Values.monitoring.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + values: | + {{- toYaml .Values.monitoring.kubePrometheusStack.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }} + labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.monitoring.kubePrometheusStack.chart | nindent 4 }} + targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + releaseName: kube-prometheus-stack + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-config + key: values +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . "kube-prometheus-stack") }}-dashboards + labels: {{ include "cluster-addons.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . 
}} + bootstrap: true + targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + releaseName: kube-prometheus-stack-dashboards + manifestSources: + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: nvidia-dcgm-exporter-dashboard + labels: + grafana_dashboard: "1" + data: + nvidia-dcgm-exporter-dashboard.json: | + {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} +{{- end }} diff --git a/charts/cluster-addons/templates/monitoring/loki-stack.yaml b/charts/cluster-addons/templates/monitoring/loki-stack.yaml new file mode 100644 index 0000000..453b56a --- /dev/null +++ b/charts/cluster-addons/templates/monitoring/loki-stack.yaml @@ -0,0 +1,67 @@ +{{- if and .Values.monitoring.enabled .Values.monitoring.lokiStack.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "loki-stack") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "loki-stack") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + values: | + {{- toYaml .Values.monitoring.lokiStack.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "loki-stack") }} + labels: {{ include "cluster-addons.componentLabels" (list . "loki-stack") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.monitoring.lokiStack.chart | nindent 4 }} + targetNamespace: {{ .Values.monitoring.lokiStack.release.namespace }} + releaseName: loki-stack + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "loki-stack") }}-config + key: values +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: Manifests +metadata: + name: {{ include "cluster-addons.componentName" (list . 
"loki-stack") }}-dashboards + labels: {{ include "cluster-addons.componentLabels" (list . "loki-stack") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + targetNamespace: {{ .Values.monitoring.kubePrometheusStack.release.namespace }} + releaseName: loki-stack-dashboards + manifestSources: + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: loki-stack-grafana-datasource + labels: + grafana_datasource: "1" + data: + loki-datasource.yaml: |- + apiVersion: 1 + datasources: + - name: Loki + type: loki + url: http://loki-stack.{{ .Values.monitoring.lokiStack.release.namespace }}:3100 + access: proxy + version: 1 + - template: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: loki-stack-grafana-dashboard + labels: + grafana_dashboard: "1" + data: + loki-dashboard.json: | + {{- .Files.Get "grafana-dashboards/loki-dashboard.json" | nindent 12 }} +{{- end }} diff --git a/charts/cluster-addons/templates/nfd.yaml b/charts/cluster-addons/templates/nfd.yaml new file mode 100644 index 0000000..74648e1 --- /dev/null +++ b/charts/cluster-addons/templates/nfd.yaml @@ -0,0 +1,62 @@ +{{- + if or + .Values.nodeFeatureDiscovery.enabled + .Values.nvidiaGPUOperator.enabled + .Values.mellanoxNetworkOperator.enabled +}} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "node-feature-discovery") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "node-feature-discovery") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + master: + extraLabelNs: + - nvidia.com + worker: + # Allow the NFD pods to be scheduled on all pods + tolerations: + - effect: "NoSchedule" + operator: "Exists" + # We want to be able to identify nodes with high-performance hardware + # So the whitelisted device classes are: + # 02 - Network Controllers (e.g. Ethernet, Infiniband) + # 03 - Display Controllers (e.g. 
GPUs) + # 0b40 - Co-processors + # 12 - Processing Accelerators (e.g. specialised AI inference chips) + config: + sources: + pci: + deviceClassWhitelist: + - "02" + - "03" + - "0b40" + - "12" + deviceLabelFields: + - vendor + overrides: | + {{- toYaml .Values.nodeFeatureDiscovery.release.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "node-feature-discovery") }} + labels: {{ include "cluster-addons.componentLabels" (list . "node-feature-discovery") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.nodeFeatureDiscovery.chart | nindent 4 }} + targetNamespace: {{ .Values.nodeFeatureDiscovery.release.namespace }} + releaseName: node-feature-discovery + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "node-feature-discovery") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . "node-feature-discovery") }}-config + key: overrides +{{- end }} diff --git a/charts/openstack-cluster/templates/addons/nvidia-gpu-operator.yaml b/charts/cluster-addons/templates/nvidia-gpu-operator.yaml similarity index 52% rename from charts/openstack-cluster/templates/addons/nvidia-gpu-operator.yaml rename to charts/cluster-addons/templates/nvidia-gpu-operator.yaml index 547f984..e7a2f2d 100644 --- a/charts/openstack-cluster/templates/addons/nvidia-gpu-operator.yaml +++ b/charts/cluster-addons/templates/nvidia-gpu-operator.yaml @@ -1,12 +1,12 @@ -{{- if and .Values.addons.enabled .Values.addons.nvidiaGPUOperator.enabled }} +{{- if .Values.nvidiaGPUOperator.enabled }} --- apiVersion: v1 kind: Secret metadata: - name: {{ include "openstack-cluster.componentName" (list . "nvidia-gpu-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . 
"nvidia-gpu-operator") }}-config labels: - {{- include "openstack-cluster.componentLabels" (list . "nvidia-gpu-operator") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" + {{- include "cluster-addons.componentLabels" (list . "nvidia-gpu-operator") | nindent 4 }} + addons.stackhpc.com/watch: "" stringData: defaults: | # Use the shared NFD @@ -28,24 +28,24 @@ stringData: - name: "CONTAINERD_CONFIG" value: "/etc/containerd/conf.d/nvidia.toml" overrides: | - {{- toYaml .Values.addons.nvidiaGPUOperator.release.values | nindent 4 }} + {{- toYaml .Values.nvidiaGPUOperator.release.values | nindent 4 }} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: HelmRelease metadata: - name: {{ include "openstack-cluster.componentName" (list . "nvidia-gpu-operator") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "nvidia-gpu-operator") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "nvidia-gpu-operator") }} + labels: {{ include "cluster-addons.componentLabels" (list . "nvidia-gpu-operator") | nindent 4 }} spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} + clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true - chart: {{ toYaml .Values.addons.nvidiaGPUOperator.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.nvidiaGPUOperator.release.namespace }} + chart: {{ toYaml .Values.nvidiaGPUOperator.chart | nindent 4 }} + targetNamespace: {{ .Values.nvidiaGPUOperator.release.namespace }} releaseName: nvidia-gpu-operator valuesSources: - secret: - name: {{ include "openstack-cluster.componentName" (list . "nvidia-gpu-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . "nvidia-gpu-operator") }}-config key: defaults - secret: - name: {{ include "openstack-cluster.componentName" (list . "nvidia-gpu-operator") }}-config + name: {{ include "cluster-addons.componentName" (list . 
"nvidia-gpu-operator") }}-config key: overrides {{- end }} diff --git a/charts/cluster-addons/templates/openstack/ccm.yaml b/charts/cluster-addons/templates/openstack/ccm.yaml new file mode 100644 index 0000000..9a91d3b --- /dev/null +++ b/charts/cluster-addons/templates/openstack/ccm.yaml @@ -0,0 +1,47 @@ +{{- if and .Values.openstack.enabled .Values.openstack.ccm.enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "cluster-addons.componentName" (list . "ccm-openstack") }}-config + labels: + {{- include "cluster-addons.componentLabels" (list . "ccm-openstack") | nindent 4 }} + addons.stackhpc.com/watch: "" +stringData: + defaults: | + secret: + create: false + cluster: + name: {{ include "cluster-addons.clusterName" . }} + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node.cloudprovider.kubernetes.io/uninitialized + value: "true" + effect: NoSchedule + - key: node-role.kubernetes.io/master + effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + overrides: | + {{- toYaml .Values.openstack.ccm.values | nindent 4 }} +--- +apiVersion: addons.stackhpc.com/v1alpha1 +kind: HelmRelease +metadata: + name: {{ include "cluster-addons.componentName" (list . "ccm-openstack") }} + labels: {{ include "cluster-addons.componentLabels" (list . "ccm-openstack") | nindent 4 }} +spec: + clusterName: {{ include "cluster-addons.clusterName" . }} + bootstrap: true + chart: {{ toYaml .Values.openstack.ccm.chart | nindent 4 }} + targetNamespace: {{ .Values.openstack.targetNamespace }} + releaseName: ccm-openstack + valuesSources: + - secret: + name: {{ include "cluster-addons.componentName" (list . "ccm-openstack") }}-config + key: defaults + - secret: + name: {{ include "cluster-addons.componentName" (list . 
"ccm-openstack") }}-config + key: overrides +{{- end }} diff --git a/charts/openstack-cluster/templates/addons/openstack/cloud-config.yaml b/charts/cluster-addons/templates/openstack/cloud-config.yaml similarity index 74% rename from charts/openstack-cluster/templates/addons/openstack/cloud-config.yaml rename to charts/cluster-addons/templates/openstack/cloud-config.yaml index 9e81afd..7fd906e 100644 --- a/charts/openstack-cluster/templates/addons/openstack/cloud-config.yaml +++ b/charts/cluster-addons/templates/openstack/cloud-config.yaml @@ -1,14 +1,14 @@ -{{- if and .Values.addons.enabled .Values.addons.openstack.enabled }} +{{- if .Values.openstack.enabled }} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: Manifests metadata: - name: {{ include "openstack-cluster.componentName" (list . "cloud-config") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "cloud-config") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "cloud-config") }} + labels: {{ include "cluster-addons.componentLabels" (list . "cloud-config") | nindent 4 }} spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} + clusterName: {{ include "cluster-addons.clusterName" . 
}} bootstrap: true - targetNamespace: {{ .Values.addons.openstack.targetNamespace }} + targetNamespace: {{ .Values.openstack.targetNamespace }} releaseName: cloud-config manifestSources: - template: | @@ -23,14 +23,14 @@ spec: [Global] use-clouds=true clouds-file=/etc/config/clouds.yaml - cloud={{ .Values.cloudName }} + cloud=openstack {%- if "cacert" in cloud_identity.data %} ca-file=/etc/config/cacert {%- else %} tls-insecure=true {%- endif %} [Networking] - {{- $networkingItems := default dict .Values.addons.openstack.cloudConfig.Networking }} + {{- $networkingItems := default dict .Values.openstack.cloudConfig.Networking }} {{- if hasKey $networkingItems "internal-network-name" }} internal-network-name={{ index $networkingItems "internal-network-name" }} {{- else }} @@ -40,7 +40,7 @@ spec: {{ $netName }}={{ $netValue }} {{- end }} [LoadBalancer] - {{- $lbItems := default dict .Values.addons.openstack.cloudConfig.LoadBalancer }} + {{- $lbItems := default dict .Values.openstack.cloudConfig.LoadBalancer }} {{- if hasKey $lbItems "floating-network-id" }} floating-network-id={{ index $lbItems "floating-network-id" }} {{- else }} @@ -51,7 +51,7 @@ spec: {{- end }} {{- range $section, $items := omit - .Values.addons.openstack.cloudConfig + .Values.openstack.cloudConfig "Global" "LoadBalancer" "Networking" diff --git a/charts/openstack-cluster/templates/addons/openstack/csi-cinder.yaml b/charts/cluster-addons/templates/openstack/csi-cinder.yaml similarity index 55% rename from charts/openstack-cluster/templates/addons/openstack/csi-cinder.yaml rename to charts/cluster-addons/templates/openstack/csi-cinder.yaml index c208b51..7cbeba3 100644 --- a/charts/openstack-cluster/templates/addons/openstack/csi-cinder.yaml +++ b/charts/cluster-addons/templates/openstack/csi-cinder.yaml @@ -1,17 +1,12 @@ -{{- - if and - .Values.addons.enabled - .Values.addons.openstack.enabled - .Values.addons.openstack.csiCinder.enabled -}} +{{- if and .Values.openstack.enabled 
.Values.openstack.csiCinder.enabled }} --- apiVersion: v1 kind: Secret metadata: - name: {{ include "openstack-cluster.componentName" (list . "csi-cinder") }}-config + name: {{ include "cluster-addons.componentName" (list . "csi-cinder") }}-config labels: - {{- include "openstack-cluster.componentLabels" (list . "csi-cinder") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" + {{- include "cluster-addons.componentLabels" (list . "csi-cinder") | nindent 4 }} + addons.stackhpc.com/watch: "" stringData: # By default, we disable the storage class deployed by the cinder-csi chart # We deploy our own instead as it gives us more control over the parameters @@ -35,43 +30,43 @@ stringData: readOnly: true storageClass: enabled: false - clusterID: {{ include "openstack-cluster.clusterName" . }} + clusterID: {{ include "cluster-addons.clusterName" . }} overrides: | - {{- toYaml .Values.addons.openstack.csiCinder.values | nindent 4 }} + {{- toYaml .Values.openstack.csiCinder.values | nindent 4 }} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: HelmRelease metadata: - name: {{ include "openstack-cluster.componentName" (list . "csi-cinder") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "csi-cinder") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "csi-cinder") }} + labels: {{ include "cluster-addons.componentLabels" (list . "csi-cinder") | nindent 4 }} spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} + clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true - chart: {{ toYaml .Values.addons.openstack.csiCinder.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.openstack.targetNamespace }} + chart: {{ toYaml .Values.openstack.csiCinder.chart | nindent 4 }} + targetNamespace: {{ .Values.openstack.targetNamespace }} releaseName: csi-cinder valuesSources: - secret: - name: {{ include "openstack-cluster.componentName" (list . 
"csi-cinder") }}-config + name: {{ include "cluster-addons.componentName" (list . "csi-cinder") }}-config key: defaults - secret: - name: {{ include "openstack-cluster.componentName" (list . "csi-cinder") }}-config + name: {{ include "cluster-addons.componentName" (list . "csi-cinder") }}-config key: overrides -{{- if .Values.addons.openstack.csiCinder.storageClass.enabled }} +{{- if .Values.openstack.csiCinder.storageClass.enabled }} --- apiVersion: addons.stackhpc.com/v1alpha1 kind: Manifests metadata: - name: {{ include "openstack-cluster.componentName" (list . "csi-cinder") }}-storageclass - labels: {{ include "openstack-cluster.componentLabels" (list . "csi-cinder") | nindent 4 }} + name: {{ include "cluster-addons.componentName" (list . "csi-cinder") }}-storageclass + labels: {{ include "cluster-addons.componentLabels" (list . "csi-cinder") | nindent 4 }} spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} + clusterName: {{ include "cluster-addons.clusterName" . }} bootstrap: true - targetNamespace: {{ .Values.addons.openstack.targetNamespace }} + targetNamespace: {{ .Values.openstack.targetNamespace }} releaseName: csi-cinder-storageclass manifestSources: - template: | - {{- with .Values.addons.openstack.csiCinder.storageClass }} + {{- with .Values.openstack.csiCinder.storageClass }} apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: diff --git a/charts/cluster-addons/values.yaml b/charts/cluster-addons/values.yaml new file mode 100644 index 0000000..11f2d20 --- /dev/null +++ b/charts/cluster-addons/values.yaml @@ -0,0 +1,214 @@ +# The name of the Cluster API cluster +# If not given, the release name is used +clusterName: + +# Settings for hook jobs +hooks: + image: + repository: ghcr.io/stackhpc/capi-helm-utils + tag: # Defaults to chart appVersion if not given + pullPolicy: IfNotPresent + imagePullSecrets: [] + backoffLimit: 1000 + activeDeadlineSeconds: 3600 + podSecurityContext: + runAsNonRoot: true + securityContext: + 
allowPrivilegeEscalation: false + capabilities: + drop: [ALL] + readOnlyRootFilesystem: true + resources: {} + hostNetwork: false + tolerations: [] + nodeSelector: {} + affinity: {} + +# Settings for the CNI addon +cni: + # Indicates if a CNI should be deployed + enabled: true + # The CNI to deploy - supported values are calico or cilium + type: calico + # Settings for the calico CNI + # See https://projectcalico.docs.tigera.io/getting-started/kubernetes/helm + calico: + chart: + repo: https://projectcalico.docs.tigera.io/charts + name: tigera-operator + version: v3.23.3 + release: + namespace: tigera-operator + values: {} + # Settings for the Cilium CNI + # See https://docs.cilium.io/en/stable/gettingstarted/k8s-install-helm/ for details + cilium: + chart: + repo: https://helm.cilium.io/ + name: cilium + version: 1.11.1 + release: + namespace: kube-system + values: {} + +# Settings for the OpenStack integrations +openstack: + # Indicates if the OpenStack integrations should be enabled + enabled: false + # The target namespace for the OpenStack integrations + targetNamespace: openstack-system + # cloud-config options for the OpenStack integrations + # The [Global] section is configured to use the target cloud + # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/openstack-cloud-controller-manager/using-openstack-cloud-controller-manager.md#config-openstack-cloud-controller-manager + # and https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/cinder-csi-plugin/using-cinder-csi-plugin.md#block-storage + cloudConfig: + # By default, ignore volume AZs for Cinder as most clouds have a single globally-attachable Cinder AZ + BlockStorage: + ignore-volume-az: true + # Settings for the Cloud Controller Manager (CCM) + ccm: + # Indicates if the OpenStack CCM should be enabled + # By default, the CCM is enabled if the OpenStack integrations are enabled + # See 
https://github.com/kubernetes/cloud-provider-openstack/blob/master/charts/openstack-cloud-controller-manager/values.yaml + enabled: true + chart: + repo: https://kubernetes.github.io/cloud-provider-openstack + name: openstack-cloud-controller-manager + version: 1.3.0 + values: {} + # Settings for the Cinder CSI plugin + csiCinder: + # Indicates if the Cinder CSI should be enabled + # By default, it is enabled if the OpenStack integrations are enabled + # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/charts/cinder-csi-plugin/values.yaml + enabled: true + chart: + repo: https://kubernetes.github.io/cloud-provider-openstack + name: openstack-cinder-csi + version: 2.2.0 + values: {} + # Variables affecting the definition of the storage class + storageClass: + # Indicates if the storage class should be enabled + enabled: true + # The name of the storage class + name: csi-cinder + # Indicates if the storage class should be annotated as the default storage class + isDefault: true + # The reclaim policy for the storage class + reclaimPolicy: Delete + # Indicates if volume expansion is allowed + allowVolumeExpansion: true + # The Cinder availability zone to use for volumes provisioned by the storage class + availabilityZone: nova + # The Cinder volume type to use for volumes provisioned by the storage class + # If not given, the default volume type will be used + volumeType: + # The allowed topologies for the storage class + allowedTopologies: + +# Settings for the metrics server +# https://github.com/kubernetes-sigs/metrics-server#helm-chart +metricsServer: + # Indicates if the metrics server should be deployed + enabled: true + chart: + repo: https://kubernetes-sigs.github.io/metrics-server + name: metrics-server + version: 3.8.2 + release: + namespace: kube-system + values: {} + +# Settings for the Kubernetes dashboard +# https://github.com/kubernetes/dashboard/tree/master/charts/helm-chart/kubernetes-dashboard +kubernetesDashboard: + # Indicates 
if the Kubernetes dashboard should be enabled + enabled: false + chart: + repo: https://kubernetes.github.io/dashboard + name: kubernetes-dashboard + version: 5.10.0 + release: + namespace: kubernetes-dashboard + values: {} + +# Settings for ingress controllers +ingress: + # Settings for the Nginx ingress controller + # https://github.com/kubernetes/ingress-nginx/tree/main/charts/ingress-nginx#configuration + nginx: + # Indicates if the Nginx ingress controller should be enabled + enabled: false + chart: + repo: https://kubernetes.github.io/ingress-nginx + name: ingress-nginx + version: 4.2.5 + release: + namespace: ingress-nginx + values: {} + +# Settings for cluster monitoring +monitoring: + # Indicates if the cluster monitoring should be enabled + enabled: false + kubePrometheusStack: + chart: + repo: https://prometheus-community.github.io/helm-charts + name: kube-prometheus-stack + version: 40.1.0 + release: + namespace: monitoring-system + values: {} + lokiStack: + enabled: true + chart: + repo: https://grafana.github.io/helm-charts + name: loki-stack + version: 2.8.2 + release: + namespace: monitoring-system + values: {} + +# Settings for node feature discovery +# https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery +nodeFeatureDiscovery: + # Indicates if node feature discovery should be enabled + enabled: true + chart: + repo: https://kubernetes-sigs.github.io/node-feature-discovery/charts + name: node-feature-discovery + version: 0.11.2 + release: + namespace: node-feature-discovery + values: {} + +# Settings for the NVIDIA GPU operator +nvidiaGPUOperator: + # Indicates if the NVIDIA GPU operator should be enabled + # Note that because it uses node feature discovery to run only on nodes + # with an NVIDIA GPU available, the overhead of enabling this on clusters + # that do not need it now but may need it in the future is low + enabled: true + chart: + repo: https://nvidia.github.io/gpu-operator + name: 
gpu-operator + version: v1.11.1 + release: + namespace: gpu-operator + values: {} + +# Settings for the Mellanox network operator +mellanoxNetworkOperator: + # Indicates if the network operator should be enabled + # Note that because it uses node feature discovery to run only on nodes + # with a Mellanox NIC available, the overhead of enabling this on clusters + # that do not need it now but may need it in the future is low + enabled: true + chart: + repo: https://mellanox.github.io/network-operator + name: network-operator + version: 1.3.0 + release: + namespace: network-operator + values: {} diff --git a/charts/openstack-cluster/Chart.yaml b/charts/openstack-cluster/Chart.yaml index 283c47a..56f24df 100644 --- a/charts/openstack-cluster/Chart.yaml +++ b/charts/openstack-cluster/Chart.yaml @@ -4,3 +4,10 @@ description: Helm chart for deploying a cluster on an OpenStack cloud using Clus type: application version: 0.1.0 appVersion: main + +dependencies: + - name: cluster-addons + version: ">=0-0" + repository: file://../cluster-addons + alias: addons + condition: addons.enabled diff --git a/charts/openstack-cluster/README.md b/charts/openstack-cluster/README.md index b713aff..cf6316f 100644 --- a/charts/openstack-cluster/README.md +++ b/charts/openstack-cluster/README.md @@ -5,8 +5,9 @@ cluster on an [OpenStack](https://www.openstack.org/) cloud using [Cluster API](https://cluster-api.sigs.k8s.io/). As well as managing the Cluster API resources for the cluster, this chart optionally -manages addons for the cluster using Kubernetes jobs. Some of these are required for -a functional cluster, e.g. a +manages addons for the cluster using addon resources from the +[Cluster API Addon Provider](https://github.com/stackhpc/cluster-api-addon-provider). +Some of these are required for a functional cluster, e.g. 
a [Container Network Interface (CNI) plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/) and the [OpenStack Cloud Controller Manager (CCM)](https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/openstack-cloud-controller-manager/using-openstack-cloud-controller-manager.md), and @@ -26,6 +27,20 @@ First, you must set up a with the [OpenStack Infrastructure Provider](https://github.com/kubernetes-sigs/cluster-api-provider-openstack) installed. +> **WARNING** +> +> This chart depends on features in +> [cluster-api-provider-openstack](https://github.com/kubernetes-sigs/cluster-api-provider-openstack) +> that are not yet in a release. +> +> StackHPC maintain custom builds of `cluster-api-provider-openstack` for use with this chart. +> You can find these in [the StackHPC fork](https://github.com/stackhpc/cluster-api-provider-openstack/releases) +> of `cluster-api-provider-openstack`. + +Addons are managed by the +[Cluster API Addon Provider](https://github.com/stackhpc/cluster-api-addon-provider), +which must also be installed if you wish to use the addons functionality. + In addition, Helm must be installed and configured to access your management cluster, and the chart repository containing this chart must be configured: @@ -172,12 +187,8 @@ command again. Some examples of updates that can be performed are: ### Cluster addons -The cluster addons are enabled by default, however by default only a CNI, the -[Metrics Server](https://github.com/kubernetes-sigs/metrics-server) and the -OpenStack CCM and Cinder CSI are enabled. - -You can configure which addons are deployed and the configuration of those addons -by specifying values for the addons Helm chart: +The cluster addons are enabled by default. 
You can configure which addons are deployed +and the configuration of those addons by specifying values for the addons Helm chart: ```yaml addons: @@ -194,8 +205,8 @@ The cluster addons also can be disabled completely using the following configura > **WARNING** > > If the cluster addons are disabled, you will need to manually install a CNI -> plugin and the OpenStack Cloud Controller Manager before the cluster deployment -> will complete successfully. +> and the OpenStack Cloud Controller Manager before the cluster deployment will +> complete successfully. ```yaml addons: diff --git a/charts/openstack-cluster/templates/addons/_helpers.tpl b/charts/openstack-cluster/templates/addons/_helpers.tpl deleted file mode 100644 index 7f335a9..0000000 --- a/charts/openstack-cluster/templates/addons/_helpers.tpl +++ /dev/null @@ -1,95 +0,0 @@ -{{/* - Determines if this revision is a migration from the previous addon method by - checking for addon jobs for the cluster. -*/}} -{{- define "openstack-cluster.addons.isMigration" -}} - {{- $clusterName := include "openstack-cluster.clusterName" . }} - {{- $exists := false }} - {{- range $job := (lookup "batch/v1" "Job" .Release.Namespace "").items }} - {{- - if and - (hasKey $job.metadata.labels "app.kubernetes.io/name") - (hasKey $job.metadata.labels "app.kubernetes.io/instance") - }} - {{- - $exists = or - $exists - (and - (index $job.metadata.labels "app.kubernetes.io/name" | eq "addons") - (index $job.metadata.labels "app.kubernetes.io/instance" | eq $clusterName) - ) - }} - {{- end }} - {{- end }} - {{- ternary "true" "" $exists }} -{{- end }} - -{{- define "openstack-cluster.addons.hookJob" -}} -{{- $ctx := index . 0 }} -{{- $hook := index . 1 }} -{{- $componentName := index . 2 }} -{{- $scriptTemplate := index . 
3 }} -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ include "openstack-cluster.componentName" (list $ctx $componentName) }} - labels: {{ include "openstack-cluster.componentLabels" (list $ctx $componentName) | nindent 4 }} - annotations: - helm.sh/hook: {{ $hook }} - helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded -spec: - backoffLimit: {{ $ctx.Values.addons.hooks.backoffLimit }} - activeDeadlineSeconds: {{ $ctx.Values.addons.hooks.activeDeadlineSeconds }} - template: - metadata: - labels: {{ include "openstack-cluster.componentSelectorLabels" (list $ctx $componentName) | nindent 8 }} - spec: - {{- with $ctx.Values.addons.hooks.imagePullSecrets }} - imagePullSecrets: {{ toYaml . | nindent 8 }} - {{- end }} - securityContext: {{ toYaml $ctx.Values.addons.hooks.podSecurityContext | nindent 8 }} - restartPolicy: OnFailure - containers: - - name: {{ $componentName }} - image: {{ - printf "%s:%s" - $ctx.Values.addons.hooks.image.repository - (default $ctx.Chart.AppVersion $ctx.Values.addons.hooks.image.tag) - }} - imagePullPolicy: {{ $ctx.Values.addons.hooks.image.pullPolicy }} - securityContext: {{ toYaml $ctx.Values.addons.hooks.securityContext | nindent 12 }} - args: - - /bin/bash - - -c - - | - set -ex - test -f "$KUBECONFIG" || exit 0 - kubectl version || exit 0 - {{- include $scriptTemplate $ctx | nindent 16 }} - env: - - name: KUBECONFIG - value: /etc/kubernetes/config - resources: {{ toYaml $ctx.Values.addons.hooks.resources | nindent 12 }} - volumeMounts: - - name: etc-kubernetes - mountPath: /etc/kubernetes - readOnly: true - hostNetwork: {{ $ctx.Values.addons.hooks.hostNetwork }} - {{- with $ctx.Values.addons.hooks.nodeSelector }} - nodeSelector: {{ toYaml . | nindent 8 }} - {{- end }} - {{- with $ctx.Values.addons.hooks.affinity }} - affinity: {{ toYaml . | nindent 8 }} - {{- end }} - {{- with $ctx.Values.addons.hooks.tolerations }} - tolerations: {{ toYaml . 
| nindent 8 }} - {{- end }} - volumes: - - name: etc-kubernetes - secret: - secretName: {{ include "openstack-cluster.componentName" (list $ctx "kubeconfig") }} - optional: true - items: - - key: value - path: config -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/cni/calico.yaml b/charts/openstack-cluster/templates/addons/cni/calico.yaml deleted file mode 100644 index ae602d7..0000000 --- a/charts/openstack-cluster/templates/addons/cni/calico.yaml +++ /dev/null @@ -1,48 +0,0 @@ -{{- - if and - .Values.addons.enabled - .Values.addons.cni.enabled - (eq .Values.addons.cni.type "calico") -}} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "cni-calico") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "cni-calico") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - defaults: | - installation: - calicoNetwork: - bgp: Disabled - nodeAddressAutodetectionV4: - kubernetes: NodeInternalIP - ipPools: - {% for cidr in cluster.spec.clusterNetwork.pods.cidrBlocks %} - - cidr: {{ "{{" }} cidr {{ "}}" }} - encapsulation: VXLAN - {% endfor %} - overrides: | - {{- toYaml .Values.addons.cni.calico.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "cni-calico") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "cni-calico") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.cni.calico.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.cni.calico.release.namespace }} - releaseName: cni-calico - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "cni-calico") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . 
"cni-calico") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/cni/cilium.yaml b/charts/openstack-cluster/templates/addons/cni/cilium.yaml deleted file mode 100644 index 0e3d688..0000000 --- a/charts/openstack-cluster/templates/addons/cni/cilium.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- - if and - .Values.addons.enabled - .Values.addons.cni.enabled - (eq .Values.addons.cni.type "cilium") -}} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "cni-cilium") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "cni-cilium") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - defaults: | - ipam: - mode: kubernetes - overrides: | - {{- toYaml .Values.addons.cni.cilium.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "cni-cilium") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "cni-cilium") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.cni.cilium.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.cni.cilium.release.namespace }} - releaseName: cni-cilium - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "cni-cilium") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . 
"cni-cilium") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/ingress-nginx.yaml b/charts/openstack-cluster/templates/addons/ingress-nginx.yaml deleted file mode 100644 index 4e08945..0000000 --- a/charts/openstack-cluster/templates/addons/ingress-nginx.yaml +++ /dev/null @@ -1,29 +0,0 @@ -{{- if and .Values.addons.enabled .Values.addons.ingress.nginx.enabled }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "ingress-nginx") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "ingress-nginx") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - values: | - {{- toYaml .Values.addons.ingress.nginx.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "ingress-nginx") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "ingress-nginx") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.ingress.nginx.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.ingress.nginx.release.namespace }} - releaseName: ingress-nginx - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "ingress-nginx") }}-config - key: values -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/kubernetes-dashboard.yaml b/charts/openstack-cluster/templates/addons/kubernetes-dashboard.yaml deleted file mode 100644 index a138595..0000000 --- a/charts/openstack-cluster/templates/addons/kubernetes-dashboard.yaml +++ /dev/null @@ -1,36 +0,0 @@ -{{- if and .Values.addons.enabled .Values.addons.kubernetesDashboard.enabled }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . 
"kubernetes-dashboard") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "kubernetes-dashboard") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - # Enable the metrics scraper by default - defaults: | - metricsScraper: - enabled: true - overrides: | - {{- toYaml .Values.addons.kubernetesDashboard.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "kubernetes-dashboard") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "kubernetes-dashboard") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.kubernetesDashboard.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.kubernetesDashboard.release.namespace }} - releaseName: kubernetes-dashboard - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "kubernetes-dashboard") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . "kubernetes-dashboard") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/metrics-server.yaml b/charts/openstack-cluster/templates/addons/metrics-server.yaml deleted file mode 100644 index f8ae1e3..0000000 --- a/charts/openstack-cluster/templates/addons/metrics-server.yaml +++ /dev/null @@ -1,35 +0,0 @@ -{{- if and .Values.addons.enabled .Values.addons.metricsServer.enabled }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "metrics-server") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . 
"metrics-server") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - defaults: | - args: - - --kubelet-insecure-tls - overrides: | - {{- toYaml .Values.addons.metricsServer.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "metrics-server") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "metrics-server") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.metricsServer.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.metricsServer.release.namespace }} - releaseName: metrics-server - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "metrics-server") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . "metrics-server") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/migrate-job.yaml b/charts/openstack-cluster/templates/addons/migrate-job.yaml deleted file mode 100644 index 6b84540..0000000 --- a/charts/openstack-cluster/templates/addons/migrate-job.yaml +++ /dev/null @@ -1,62 +0,0 @@ -{{/* - Job that cleans up artifacts from the previous job-based addon installation - in preparation for creating addon objects. - - We only produce the job if addons are enabled but no existing addon resources exist. 
-*/}} - -{{- define "openstack-cluster.addons.migrateScript" -}} -# Remove all the old kustomize releases where possible -helm status -n kustomize-releases ccm-openstack && \ - helm delete -n kustomize-releases ccm-openstack -helm status -n kustomize-releases metrics-server && \ - helm delete -n kustomize-releases metrics-server - -# The csi-cinder kustomize release contains the Cinder storage class, which we cannot delete -# if there are volumes associated with it -# Instead, we move the release to the new namespace, move the storage class into a separate -# release and annotate the storage class so that it doesn't get removed by the Helm upgrade -if helm status -n kustomize-releases csi-cinder; then - helm-move csi-cinder kustomize-releases {{ .Values.addons.openstack.targetNamespace }} - helm-adopt \ - csi-cinder-storageclass \ - {{ .Values.addons.openstack.targetNamespace }} \ - storageclass/{{ .Values.addons.openstack.csiCinder.storageClass.name }} - kubectl annotate \ - storageclass/{{ .Values.addons.openstack.csiCinder.storageClass.name }} \ - "helm.sh/resource-policy=keep" -fi - -# Adopt resources previously created in post-install scripts into the relevant Helm releases -helm-adopt \ - cni-calico \ - {{ .Values.addons.cni.calico.release.namespace }} \ - installation/default -helm-adopt \ - kube-prometheus-stack-dashboards \ - {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} \ - configmap/nvidia-dcgm-exporter-dashboard \ - --namespace {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} -helm-adopt \ - loki-stack-dashboards \ - {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} \ - configmap/loki-stack-grafana-datasource \ - --namespace {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} -helm-adopt \ - loki-stack-dashboards \ - {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} \ - configmap/loki-stack-grafana-dashboard \ - --namespace {{ 
.Values.addons.monitoring.kubePrometheusStack.release.namespace }} -{{- end }} - -{{- - if and - .Values.addons.enabled - (include "openstack-cluster.addons.isMigration" .) -}} -{{- - include - "openstack-cluster.addons.hookJob" - (list . "pre-upgrade" "addons-migrate" "openstack-cluster.addons.migrateScript") -}} -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/monitoring/kube-prometheus-stack.yaml b/charts/openstack-cluster/templates/addons/monitoring/kube-prometheus-stack.yaml deleted file mode 100644 index 9652903..0000000 --- a/charts/openstack-cluster/templates/addons/monitoring/kube-prometheus-stack.yaml +++ /dev/null @@ -1,99 +0,0 @@ -{{- define "openstack-cluster.kube-prometheus-stack.hookScript" -}} -# With the version bump to 40.x, kube-prometheus-stack picks up prometheus-node-exporter 4.x -# This changes the selector labels on the daemonset, which is an immutable field, so we remove -# the daemonset with the old labels before upgrading -# https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#from-39x-to-40x -kubectl delete daemonset \ - -l release=kube-prometheus-stack,app=prometheus-node-exporter \ - -n {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} -{{- end }} - -{{- if and .Values.addons.enabled .Values.addons.monitoring.enabled }} -{{- - $addon := lookup - "addons.stackhpc.com/v1alpha1" - "HelmRelease" - .Release.Namespace - (include "openstack-cluster.componentName" (list . "kube-prometheus-stack")) -}} -{{- - $previousVersion := dig - "spec" - "chart" - "version" - .Values.addons.monitoring.kubePrometheusStack.chart.version - $addon -}} -{{- - if and - .Release.IsUpgrade - (or - (include "openstack-cluster.addons.isMigration" .) - (and - (semverCompare ">=40.0.0" .Values.addons.monitoring.kubePrometheusStack.chart.version) - (semverCompare "<40.0.0" $previousVersion) - ) - ) -}} ---- -{{- - include - "openstack-cluster.addons.hookJob" - (list - . 
- "pre-upgrade" - "kube-prometheus-stack-migrate" - "openstack-cluster.kube-prometheus-stack.hookScript" - ) -}} -{{- end }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "kube-prometheus-stack") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - values: | - {{- toYaml .Values.addons.monitoring.kubePrometheusStack.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "kube-prometheus-stack") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.monitoring.kubePrometheusStack.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} - releaseName: kube-prometheus-stack - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "kube-prometheus-stack") }}-config - key: values ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: Manifests -metadata: - name: {{ include "openstack-cluster.componentName" (list . "kube-prometheus-stack") }}-dashboards - labels: {{ include "openstack-cluster.componentLabels" (list . "kube-prometheus-stack") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . 
}} - bootstrap: true - targetNamespace: {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} - releaseName: kube-prometheus-stack-dashboards - manifestSources: - - template: | - apiVersion: v1 - kind: ConfigMap - metadata: - name: nvidia-dcgm-exporter-dashboard - labels: - grafana_dashboard: "1" - data: - nvidia-dcgm-exporter-dashboard.json: | - {{- .Files.Get "grafana-dashboards/nvidia-dcgm-exporter-dashboard_rev2.json" | nindent 12 }} -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/monitoring/loki-stack.yaml b/charts/openstack-cluster/templates/addons/monitoring/loki-stack.yaml deleted file mode 100644 index 10f464e..0000000 --- a/charts/openstack-cluster/templates/addons/monitoring/loki-stack.yaml +++ /dev/null @@ -1,119 +0,0 @@ -{{- define "openstack-cluster.loki-stack.hookScript" -}} -# With the version bump from 2.6.3 to 2.6.4, loki-stack picks up an updated promtail that -# changes the selector labels on the daemonset, which is an immutable field -# So we remove the daemonset with the old labels before upgrading -kubectl delete daemonset \ - -l release=loki-stack,app=promtail \ - -n {{ .Values.addons.monitoring.lokiStack.release.namespace }} -{{- end }} - -{{- - if and - .Values.addons.enabled - .Values.addons.monitoring.enabled - .Values.addons.monitoring.lokiStack.enabled -}} -{{- - $addon := lookup - "addons.stackhpc.com/v1alpha1" - "HelmRelease" - .Release.Namespace - (include "openstack-cluster.componentName" (list . "loki-stack")) -}} -{{- - $previousVersion := dig - "spec" - "chart" - "version" - .Values.addons.monitoring.lokiStack.chart.version - $addon -}} -{{- - if and - .Release.IsUpgrade - (or - (include "openstack-cluster.addons.isMigration" .) - (and - (semverCompare ">=2.6.4" .Values.addons.monitoring.lokiStack.chart.version) - (semverCompare "<2.6.4" $previousVersion) - ) - ) -}} ---- -{{- - include - "openstack-cluster.addons.hookJob" - (list - . 
- "pre-upgrade" - "loki-stack-migrate" - "openstack-cluster.loki-stack.hookScript" - ) -}} -{{- end }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "loki-stack") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "loki-stack") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - values: | - {{- toYaml .Values.addons.monitoring.lokiStack.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "loki-stack") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "loki-stack") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.monitoring.lokiStack.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.monitoring.lokiStack.release.namespace }} - releaseName: loki-stack - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "loki-stack") }}-config - key: values ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: Manifests -metadata: - name: {{ include "openstack-cluster.componentName" (list . "loki-stack") }}-dashboards - labels: {{ include "openstack-cluster.componentLabels" (list . "loki-stack") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . 
}} - bootstrap: true - targetNamespace: {{ .Values.addons.monitoring.kubePrometheusStack.release.namespace }} - releaseName: loki-stack-dashboards - manifestSources: - - template: | - apiVersion: v1 - kind: ConfigMap - metadata: - name: loki-stack-grafana-datasource - labels: - grafana_datasource: "1" - data: - loki-datasource.yaml: |- - apiVersion: 1 - datasources: - - name: Loki - type: loki - url: http://loki-stack.{{ .Values.addons.monitoring.lokiStack.release.namespace }}:3100 - access: proxy - version: 1 - - template: | - apiVersion: v1 - kind: ConfigMap - metadata: - name: loki-stack-grafana-dashboard - labels: - grafana_dashboard: "1" - data: - loki-dashboard.json: | - {{- .Files.Get "grafana-dashboards/loki-dashboard.json" | nindent 12 }} -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/nfd.yaml b/charts/openstack-cluster/templates/addons/nfd.yaml deleted file mode 100644 index eac3730..0000000 --- a/charts/openstack-cluster/templates/addons/nfd.yaml +++ /dev/null @@ -1,65 +0,0 @@ -{{- - if and - .Values.addons.enabled - (or - .Values.addons.nodeFeatureDiscovery.enabled - .Values.addons.nvidiaGPUOperator.enabled - .Values.addons.mellanoxNetworkOperator.enabled - ) -}} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "node-feature-discovery") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "node-feature-discovery") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - defaults: | - master: - extraLabelNs: - - nvidia.com - worker: - # Allow the NFD pods to be scheduled on all pods - tolerations: - - effect: "NoSchedule" - operator: "Exists" - # We want to be able to identify nodes with high-performance hardware - # So the whitelisted device classes are: - # 02 - Network Controllers (e.g. Ethernet, Infiniband) - # 03 - Display Controllers (e.g. GPUs) - # 0b40 - Co-processors - # 12 - Processing Accelerators (e.g. 
specialised AI inference chips) - config: - sources: - pci: - deviceClassWhitelist: - - "02" - - "03" - - "0b40" - - "12" - deviceLabelFields: - - vendor - overrides: | - {{- toYaml .Values.addons.nodeFeatureDiscovery.release.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "node-feature-discovery") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "node-feature-discovery") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.nodeFeatureDiscovery.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.nodeFeatureDiscovery.release.namespace }} - releaseName: node-feature-discovery - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "node-feature-discovery") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . "node-feature-discovery") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/templates/addons/openstack/ccm.yaml b/charts/openstack-cluster/templates/addons/openstack/ccm.yaml deleted file mode 100644 index 73ebfe1..0000000 --- a/charts/openstack-cluster/templates/addons/openstack/ccm.yaml +++ /dev/null @@ -1,52 +0,0 @@ -{{- - if and - .Values.addons.enabled - .Values.addons.openstack.enabled - .Values.addons.openstack.ccm.enabled -}} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "openstack-cluster.componentName" (list . "ccm-openstack") }}-config - labels: - {{- include "openstack-cluster.componentLabels" (list . "ccm-openstack") | nindent 4 }} - {{ .Values.addons.watchLabel }}: "" -stringData: - defaults: | - secret: - create: false - cluster: - name: {{ include "openstack-cluster.clusterName" . 
}} - nodeSelector: - node-role.kubernetes.io/control-plane: "" - tolerations: - - key: node.cloudprovider.kubernetes.io/uninitialized - value: "true" - effect: NoSchedule - - key: node-role.kubernetes.io/master - effect: NoSchedule - - key: node-role.kubernetes.io/control-plane - effect: NoSchedule - overrides: | - {{- toYaml .Values.addons.openstack.ccm.values | nindent 4 }} ---- -apiVersion: addons.stackhpc.com/v1alpha1 -kind: HelmRelease -metadata: - name: {{ include "openstack-cluster.componentName" (list . "ccm-openstack") }} - labels: {{ include "openstack-cluster.componentLabels" (list . "ccm-openstack") | nindent 4 }} -spec: - clusterName: {{ include "openstack-cluster.clusterName" . }} - bootstrap: true - chart: {{ toYaml .Values.addons.openstack.ccm.chart | nindent 4 }} - targetNamespace: {{ .Values.addons.openstack.targetNamespace }} - releaseName: ccm-openstack - valuesSources: - - secret: - name: {{ include "openstack-cluster.componentName" (list . "ccm-openstack") }}-config - key: defaults - - secret: - name: {{ include "openstack-cluster.componentName" (list . 
"ccm-openstack") }}-config - key: overrides -{{- end }} diff --git a/charts/openstack-cluster/values.yaml b/charts/openstack-cluster/values.yaml index ee7d9f6..5fa9317 100644 --- a/charts/openstack-cluster/values.yaml +++ b/charts/openstack-cluster/values.yaml @@ -1,9 +1,3 @@ -# Configuration that is shared between the cluster and the addons -global: - # The Kubernetes version of the cluster - # This should match the version of kubelet and kubeadm in the image - kubernetesVersion: - # The name of an existing secret containing a clouds.yaml and optional cacert cloudCredentialsSecretName: # OR @@ -16,11 +10,14 @@ cloudCACert: # The name of the cloud to use from the specified clouds.yaml cloudName: openstack +# The Kubernetes version of the cluster +# This should match the version of kubelet and kubeadm in the image +kubernetesVersion: + # The name of the image to use for cluster machines -# This is used when creating machines using ephemeral root disks machineImage: +# OR # The ID of the image to use for cluster machines -# This is required when creating machines with volumes as root disks machineImageId: # The name of the SSH key to inject into cluster machines @@ -319,217 +316,6 @@ autoscaler: addons: # Indicates if cluster addons should be deployed enabled: true - - # The label to use to indicate that a configmap or secret should be watched - watchLabel: addons.stackhpc.com/watch - - # Settings for hook jobs - hooks: - image: - repository: ghcr.io/stackhpc/capi-helm-utils - tag: # Defaults to chart appVersion if not given - pullPolicy: IfNotPresent - imagePullSecrets: [] - backoffLimit: 1000 - activeDeadlineSeconds: 3600 - podSecurityContext: - runAsNonRoot: true - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: [ALL] - readOnlyRootFilesystem: true - resources: {} - hostNetwork: false - tolerations: [] - nodeSelector: {} - affinity: {} - - # Settings for the CNI addon - cni: - # Indicates if a CNI should be deployed - enabled: true - # 
The CNI to deploy - supported values are calico or cilium - type: calico - # Settings for the calico CNI - # See https://projectcalico.docs.tigera.io/getting-started/kubernetes/helm - calico: - chart: - repo: https://projectcalico.docs.tigera.io/charts - name: tigera-operator - version: v3.23.3 - release: - namespace: tigera-operator - values: {} - # Settings for the Cilium CNI - # See https://docs.cilium.io/en/stable/gettingstarted/k8s-install-helm/ for details - cilium: - chart: - repo: https://helm.cilium.io/ - name: cilium - version: 1.11.1 - release: - namespace: kube-system - values: {} - - # Settings for the OpenStack integrations + # Enable the openstack integrations by default openstack: - # Indicates if the OpenStack integrations should be enabled enabled: true - # The target namespace for the OpenStack integrations - targetNamespace: openstack-system - # cloud-config options for the OpenStack integrations - # The [Global] section is configured to use the target cloud - # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/openstack-cloud-controller-manager/using-openstack-cloud-controller-manager.md#config-openstack-cloud-controller-manager - # and https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/cinder-csi-plugin/using-cinder-csi-plugin.md#block-storage - cloudConfig: - # By default, ignore volume AZs for Cinder as most clouds have a single globally-attachable Cinder AZ - BlockStorage: - ignore-volume-az: true - # Settings for the Cloud Controller Manager (CCM) - ccm: - # Indicates if the OpenStack CCM should be enabled - # By default, the CCM is enabled if the OpenStack integrations are enabled - # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/charts/openstack-cloud-controller-manager/values.yaml - enabled: true - chart: - repo: https://kubernetes.github.io/cloud-provider-openstack - name: openstack-cloud-controller-manager - version: 1.3.0 - values: {} - # Settings for the Cinder 
CSI plugin - csiCinder: - # Indicates if the Cinder CSI should be enabled - # By default, it is enabled if the OpenStack integrations are enabled - # See https://github.com/kubernetes/cloud-provider-openstack/blob/master/charts/cinder-csi-plugin/values.yaml - enabled: true - chart: - repo: https://kubernetes.github.io/cloud-provider-openstack - name: openstack-cinder-csi - version: 2.2.0 - values: {} - # Variables affecting the definition of the storage class - storageClass: - # Indicates if the storage class should be enabled - enabled: true - # The name of the storage class - name: csi-cinder - # Indicates if the storage class should be annotated as the default storage class - isDefault: true - # The reclaim policy for the storage class - reclaimPolicy: Delete - # Indicates if volume expansion is allowed - allowVolumeExpansion: true - # The Cinder availability zone to use for volumes provisioned by the storage class - availabilityZone: nova - # The Cinder volume type to use for volumes provisioned by the storage class - # If not given, the default volume type will be used - volumeType: - # The allowed topologies for the storage class - allowedTopologies: - - # Settings for the metrics server - # https://github.com/kubernetes-sigs/metrics-server#helm-chart - metricsServer: - # Indicates if the metrics server should be deployed - enabled: true - chart: - repo: https://kubernetes-sigs.github.io/metrics-server - name: metrics-server - version: 3.8.2 - release: - namespace: kube-system - values: {} - - # Settings for the Kubernetes dashboard - # https://github.com/kubernetes/dashboard/tree/master/charts/helm-chart/kubernetes-dashboard - kubernetesDashboard: - # Indicates if the Kubernetes dashboard should be enabled - enabled: false - chart: - repo: https://kubernetes.github.io/dashboard - name: kubernetes-dashboard - version: 5.10.0 - release: - namespace: kubernetes-dashboard - values: {} - - # Settings for ingress controllers - ingress: - # Settings for the Nginx 
ingress controller - # https://github.com/kubernetes/ingress-nginx/tree/main/charts/ingress-nginx#configuration - nginx: - # Indicates if the Nginx ingress controller should be enabled - enabled: false - chart: - repo: https://kubernetes.github.io/ingress-nginx - name: ingress-nginx - version: 4.2.5 - release: - namespace: ingress-nginx - values: {} - - # Settings for cluster monitoring - monitoring: - # Indicates if the cluster monitoring should be enabled - enabled: false - kubePrometheusStack: - chart: - repo: https://prometheus-community.github.io/helm-charts - name: kube-prometheus-stack - version: 40.1.0 - release: - namespace: monitoring-system - values: {} - lokiStack: - enabled: true - chart: - repo: https://grafana.github.io/helm-charts - name: loki-stack - version: 2.8.2 - release: - namespace: monitoring-system - values: {} - - # Settings for node feature discovery - # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery - nodeFeatureDiscovery: - # Indicates if node feature discovery should be enabled - enabled: true - chart: - repo: https://kubernetes-sigs.github.io/node-feature-discovery/charts - name: node-feature-discovery - version: 0.11.2 - release: - namespace: node-feature-discovery - values: {} - - # Settings for the NVIDIA GPU operator - nvidiaGPUOperator: - # Indicates if the NVIDIA GPU operator should be enabled - # Note that because it uses node feature discovery to run only on nodes - # with an NVIDIA GPU available, the overhead of enabling this on clusters - # that do not need it now but may need it in the future is low - enabled: true - chart: - repo: https://nvidia.github.io/gpu-operator - name: gpu-operator - version: v1.11.1 - release: - namespace: gpu-operator - values: {} - - # Settings for the Mellanox network operator - mellanoxNetworkOperator: - # Indicates if the network operator should be enabled - # Note that because it uses node feature discovery to run only on nodes - # 
with a Mellanox NIC available, the overhead of enabling this on clusters - # that do not need it now but may need it in the future is low - enabled: true - chart: - repo: https://mellanox.github.io/network-operator - name: network-operator - version: 1.3.0 - release: - namespace: network-operator - values: {}