Add NFD + NVIDIA GPU operator by default

parent 65b7380864
commit e736e0bb8c
@@ -311,6 +311,8 @@ hooks:
# These should include environment variables, volume mounts etc. if they need
# to target a remote cluster using kubeconfigSecret
extraInitContainers: []
# Indicates whether a pre-delete hook should be generated for the addon
generatePreDeleteHook: true
backoffLimit: 1000
activeDeadlineSeconds: 3600
podSecurityContext:
@@ -343,8 +345,10 @@ pre-upgrade hook is produced to uninstall the addon.
{{- include "addon.config.secret" (list $ctx $name $config) }}
---
{{- include "addon.job.install" (list $ctx $name $config) }}
{{- if $config.generatePreDeleteHook }}
---
{{- include "addon.job.uninstall" (list $ctx $name "pre-delete" $config) }}
{{- end }}
{{- else if $ctx.Release.IsUpgrade }}
{{- $secretName := include "addon.fullname" (list $ctx $name) | printf "%s-config" }}
{{- if lookup "v1" "Secret" $ctx.Release.Namespace $secretName }}
@@ -53,8 +53,8 @@ template:
- name: config
mountPath: /config
readOnly: true
{{- range $dep := $config.dependsOn }}
- name: wait-for-{{ $dep }}
{{- if $config.dependsOn }}
- name: wait-for-dependencies
image: {{ printf "%s:%s" $config.image.repository (default $ctx.Chart.AppVersion $config.image.tag) }}
imagePullPolicy: {{ $config.image.pullPolicy }}
securityContext: {{ toYaml $config.securityContext | nindent 10 }}
@@ -63,6 +63,7 @@ template:
- -c
- |
set -ex
{{- range $dep := $config.dependsOn }}
{{- $labels := include "addon.job.selectorLabels" (list $ctx $dep "install") | fromYaml }}
{{- range $i, $label := (keys $labels | sortAlpha) -}}
{{- if $i }}
@@ -72,6 +73,7 @@ template:
{{- end }}
{{- end }}
kubectl wait --for=condition=Complete job -n {{ $ctx.Release.Namespace }} -l "$LABELS" --all --timeout=-1s
{{- end }}
resources: {{ toYaml $config.resources | nindent 10 }}
{{- end }}
{{- range $config.extraInitContainers }}
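The hunks above collapse the per-dependency init containers into a single wait-for-dependencies container. As a rough sketch, for an addon with one dependency the rendered container looks something like the following (the image, namespace and selector labels are hypothetical placeholders, not values from this commit):

    - name: wait-for-dependencies
      image: ghcr.io/stackhpc/k8s-utils:main   # placeholder; taken from the addon's image settings, tag defaults to the chart appVersion
      args:
        - /bin/bash
        - -c
        - |
            set -ex
            LABELS="app.kubernetes.io/component=my-dependency-install"   # hypothetical; built from addon.job.selectorLabels
            kubectl wait --for=condition=Complete job -n my-namespace -l "$LABELS" --all --timeout=-1s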
@@ -141,7 +143,11 @@ apiVersion: batch/v1
kind: Job
metadata:
{{- $checksum := include "addon.job.install.spec" . | sha256sum }}
{{- $jobName := printf "%s-%s" (include "addon.job.name" (list $ctx $name "install")) (trunc 5 $checksum) }}
{{-
$jobName := printf "%s-%s"
(include "addon.job.name" (list $ctx $name "install") | trunc 57 | trimSuffix "-")
(trunc 5 $checksum)
}}
name: {{ $jobName }}
labels: {{ include "addon.job.labels" (list $ctx $name "install") | nindent 4 }}
spec:
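The reworked $jobName keeps generated Job names within the 63-character limit on Kubernetes object names, while the checksum suffix ties the name to the rendered install spec. A sketch of the arithmetic (the rendered name below is hypothetical):

    # base name  -> addon.job.name output, truncated to 57 characters, trailing "-" trimmed
    # separator  -> 1 character ("-")
    # suffix     -> first 5 characters of the sha256 checksum of addon.job.install.spec
    # maximum    -> 57 + 1 + 5 = 63 characters
    name: my-cluster-addons-some-long-addon-name-install-1a2b3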
@@ -90,6 +90,9 @@ extraInitContainers:
- "-1s"
resources: {{ toYaml $ctx.Values.jobDefaults.resources | nindent 6 }}
{{- end }}
# If the addons are deployed as part of a Cluster API cluster, suppress the pre-delete hooks
# If the cluster no longer exists, then neither do the addons!
generatePreDeleteHook: {{ not $ctx.Values.clusterApi | toYaml }}
{{- end }}

{{/*
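For reference, how that default renders for the two settings of clusterApi:

    # clusterApi: false  ->  a pre-delete hook Job is generated for the addon
    generatePreDeleteHook: true
    # clusterApi: true   ->  the hook is suppressed; the addons disappear with the cluster
    generatePreDeleteHook: false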
@@ -118,6 +121,10 @@ Determines if an addon is enabled given the name.
{{- $ctx.Values.metricsServer.enabled | toYaml -}}
{{- else if eq $name "monitoring" -}}
{{- $ctx.Values.monitoring.enabled | toYaml -}}
{{- else if eq $name "nfd" -}}
{{- $ctx.Values.nfd.enabled | toYaml -}}
{{- else if eq $name "nvidia-gpu-operator" -}}
{{- $ctx.Values.nvidiaGPUOperator.enabled | toYaml -}}
{{- else if hasKey $ctx.Values.extraAddons $name -}}
{{- dig $name "enabled" true $ctx.Values.extraAddons | toYaml -}}
{{- else -}}
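Note the dig fallback in the extraAddons branch: an entry is treated as enabled unless it explicitly says otherwise. A minimal sketch with a hypothetical addon name:

    extraAddons:
      my-custom-addon:
        enabled: false   # omit this key and dig falls back to true, i.e. enabled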
@@ -139,6 +146,8 @@ value:
{{- else if eq $name "monitoring" }}
- storage
- ingress
{{- else if eq $name "nvidia-gpu-operator" }}
- nfd
{{- else if hasKey $ctx.Values.extraAddons $name }}
{{- dig $name "dependsOn" list $ctx.Values.extraAddons | toYaml | nindent 2 }}
{{- else }}
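Since nvidia-gpu-operator now lists nfd as a dependency, clusters that want neither of the new defaults should disable both. A minimal values override, using the keys added later in this commit:

    nfd:
      enabled: false
    nvidiaGPUOperator:
      enabled: false   # also disable this when disabling NFD, since it depends on the nfd addon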
charts/cluster-addons/templates/nfd.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
{{- define "cluster-addons.nfd.config" -}}
{{- include "cluster-addons.job.defaults" (list . "nfd") }}
installType: helm
helm: {{ omit .Values.nfd "enabled" | toYaml | nindent 2 }}
{{- end }}

{{-
  include "addon.job" (list
    .
    "nfd"
    "cluster-addons.nfd.config"
  )
}}
charts/cluster-addons/templates/nvidia-gpu-operator.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
{{- define "cluster-addons.nvidia-gpu-operator.config" -}}
{{- include "cluster-addons.job.defaults" (list . "nvidia-gpu-operator") }}
installType: helm
helm: {{ omit .Values.nvidiaGPUOperator "enabled" | toYaml | nindent 2 }}
{{- end }}

{{-
  include "addon.job" (list
    .
    "nvidia-gpu-operator"
    "cluster-addons.nvidia-gpu-operator.config"
  )
}}
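Everything under nvidiaGPUOperator except enabled is passed straight through as the addon's Helm configuration, so with the default values added later in this commit the config document handed to the addon job is roughly the following sketch (the extra keys merged in by cluster-addons.job.defaults are omitted, and the exact nesting follows the values file below):

    installType: helm
    helm:
      chart:
        repo: https://nvidia.github.io/gpu-operator
        name: gpu-operator
        version: v1.9.1
      release:
        namespace: gpu-operator
      values:
        nfd:
          enabled: false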
charts/cluster-addons/templates/purge-cloud-resources.yaml (new file, 102 lines)
@@ -0,0 +1,102 @@
{{- if and .Values.clusterApi .Values.openstack.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ printf "%s-%s" (include "cluster-addons.fullname" .) "purge-cloud-resources" | trunc 63 | trimSuffix "-" }}
  labels: {{ include "cluster-addons.labels" . | nindent 4 }}
  annotations:
    helm.sh/hook: pre-delete
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
spec:
  backoffLimit: {{ .Values.jobDefaults.backoffLimit }}
  activeDeadlineSeconds: {{ .Values.jobDefaults.activeDeadlineSeconds }}
  template:
    metadata:
      labels: {{ include "cluster-addons.selectorLabels" . | nindent 8 }}
    spec:
      {{- with .Values.jobDefaults.imagePullSecrets }}
      imagePullSecrets: {{ toYaml . | nindent 8 }}
      {{- end }}
      securityContext: {{ toYaml .Values.jobDefaults.podSecurityContext | nindent 8 }}
      restartPolicy: OnFailure
      serviceAccountName: {{ tpl .Values.serviceAccount.name . }}
      {{- if .Values.kubeconfigSecret.name }}
      # Use an init container to install the kubeconfig file from the specified secret if required
      # We don't use a regular volume for this because we need the hook not to block in the case
      # where the secret is not available
      initContainers:
        - name: install-kubeconfig
          image: {{
            printf "%s:%s"
              .Values.jobDefaults.image.repository
              (default .Chart.AppVersion .Values.jobDefaults.image.tag)
          }}
          imagePullPolicy: {{ .Values.jobDefaults.image.pullPolicy }}
          securityContext: {{ toYaml .Values.jobDefaults.securityContext | nindent 12 }}
          args:
            - /bin/bash
            - -c
            - |
                set -ex
                get_kubeconfig() {
                  kubectl get secret {{ tpl .Values.kubeconfigSecret.name . }} \
                    -n {{ .Release.Namespace }} \
                    -o go-template='{{ printf "{{ index .data \"%s\" | base64decode }}" .Values.kubeconfigSecret.key }}' \
                    > /config/auth/kubeconfig
                }
                get_kubeconfig || true
          resources: {{ toYaml .Values.jobDefaults.resources | nindent 12 }}
          volumeMounts:
            - name: kubeconfig
              mountPath: /config/auth
      {{- end }}
      containers:
        - name: purge-cloud-resources
          image: {{
            printf "%s:%s"
              .Values.jobDefaults.image.repository
              (default .Chart.AppVersion .Values.jobDefaults.image.tag)
          }}
          imagePullPolicy: {{ .Values.jobDefaults.image.pullPolicy }}
          securityContext: {{ toYaml .Values.jobDefaults.securityContext | nindent 12 }}
          # We can only make a best effort to delete the resources as we don't want the hook to block
          # So we bail without an error if the kubeconfig doesn't exist, the API is not reachable or
          # the deletion fails
          args:
            - /bin/bash
            - -c
            - |
                set -x
                {{- if .Values.kubeconfigSecret.name }}
                test -f "$KUBECONFIG" || exit 0
                {{- end }}
                kubectl version || exit 0
                for ns in $(kubectl get ns -o jsonpath='{.items[*].metadata.name}'); do
                  for svc in $(kubectl get svc -n "$ns" -o jsonpath='{.items[?(@.spec.type == "LoadBalancer")].metadata.name}'); do
                    kubectl delete svc "$svc" -n "$ns" || true
                  done
                done
          {{- if .Values.kubeconfigSecret.name }}
          env:
            - name: KUBECONFIG
              value: /config/auth/kubeconfig
          {{- end }}
          resources: {{ toYaml .Values.jobDefaults.resources | nindent 12 }}
          volumeMounts:
            - name: kubeconfig
              mountPath: /config/auth
              readOnly: true
      hostNetwork: {{ .Values.jobDefaults.hostNetwork }}
      {{- with .Values.jobDefaults.nodeSelector }}
      nodeSelector: {{ toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.jobDefaults.affinity }}
      affinity: {{ toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.jobDefaults.tolerations }}
      tolerations: {{ toYaml . | nindent 8 }}
      {{- end }}
      volumes:
        - name: kubeconfig
          emptyDir: {}
{{- end }}
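The kubeconfig handling in this hook only kicks in when kubeconfigSecret.name is set, i.e. when the purge job has to reach a remote (for example Cluster API workload) cluster rather than the one it runs in. A sketch of the values involved; the secret name and key shown are hypothetical, though the name is rendered with tpl so it may itself be a template:

    kubeconfigSecret:
      name: "{{ .Release.Name }}-kubeconfig"   # hypothetical; rendered as a template
      key: value                               # key inside the secret that holds the kubeconfig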
@@ -47,25 +47,24 @@ serviceAccount:
  # This is treated as a template during rendering
  name: "{{ include \"cluster-addons.fullname\" . }}-deployer"

# Defaults for job settings
# In all cases, the defaults for the version of the addons chart in use are used
# See the values for the addons chart for details
# Default settings for jobs
jobDefaults:
  image:
    repository: ghcr.io/stackhpc/k8s-utils
    tag: # Defaults to chart appVersion if not given
    pullPolicy: IfNotPresent
  imagePullSecrets: []
  backoffLimit: 1000
  activeDeadlineSeconds: 3600
  podSecurityContext:
    runAsNonRoot: true
  securityContext:
    allowPrivilegeEscalation: false
  resources: {}
  # imagePullSecrets:
  # backoffLimit:
  # activeDeadlineSeconds:
  # podSecurityContext:
  # hostNetwork:
  # tolerations:
  # nodeSelector:
  # affinity:
  hostNetwork: false
  tolerations: []
  nodeSelector: {}
  affinity: {}

# The available categories for dependencies and the addons that belong to them
categories:
@@ -266,5 +265,65 @@ monitoring:
    namespace: monitoring-system
  values: {}

# Settings for node feature discovery (NFD)
nfd:
  # Indicates if node feature discovery should be enabled
  enabled: true
  chart:
    repo: https://kubernetes-sigs.github.io/node-feature-discovery/charts
    name: node-feature-discovery
    version: 0.10.1
  release:
    namespace: node-feature-discovery
  values:
    master:
      extraLabelNs:
        - nvidia.com
    worker:
      # Allow the NFD pods to be scheduled on master nodes
      tolerations:
        - key: "node-role.kubernetes.io/master"
          operator: "Equal"
          value: ""
          effect: "NoSchedule"
        - key: "nvidia.com/gpu"
          operator: "Equal"
          value: "present"
          effect: "NoSchedule"
      # We want to be able to identify nodes with high-performance hardware
      # So the whitelisted device classes are:
      # 02 - Network Controllers (e.g. Ethernet, Infiniband)
      # 03 - Display Controllers (e.g. GPUs)
      # 0b40 - Co-processors
      # 12 - Processing Accelerators (e.g. specialised AI inference chips)
      config:
        sources:
          pci:
            deviceClassWhitelist:
              - "02"
              - "03"
              - "0b40"
              - "12"
            deviceLabelFields:
              - vendor

# Settings for the NVIDIA GPU operator
nvidiaGPUOperator:
  # Indicates if the NVIDIA GPU operator should be enabled
  # Note that because it uses node feature discovery to run only on nodes
  # with an NVIDIA GPU available, the overhead of enabling this on clusters
  # that do not need it now but may need it in the future is low
  enabled: true
  chart:
    repo: https://nvidia.github.io/gpu-operator
    name: gpu-operator
    version: v1.9.1
  release:
    namespace: gpu-operator
  values:
    # Use the shared NFD
    nfd:
      enabled: false

# Map of extra addons in the form "component name" -> "addon spec"
extraAddons: {}
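With the pci source restricted to those device classes and deviceLabelFields set to just the vendor, NFD should label nodes by PCI vendor ID alone; for NVIDIA hardware the resulting node label is expected to look roughly like the line below (an assumption based on NFD's pci labelling scheme, not something shown in this commit), which is what lets the GPU operator target only GPU nodes while reusing this shared NFD instead of deploying its own:

    feature.node.kubernetes.io/pci-10de.present: "true"   # 10de is NVIDIA's PCI vendor ID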
@@ -4,10 +4,11 @@ ENV UTILS_UID 1001
ENV UTILS_GID 1001
ENV UTILS_USER utils
ENV UTILS_GROUP utils
ENV UTILS_HOME /home/utils
RUN groupadd --gid $UTILS_GID $UTILS_GROUP && \
    useradd \
      --no-create-home \
      --no-user-group \
      --home-dir $UTILS_HOME \
      --create-home \
      --gid $UTILS_GID \
      --shell /sbin/nologin \
      --uid $UTILS_UID \
@@ -100,5 +101,6 @@ ENV KUBECTL_VN_LATEST v1.23
COPY ./bin/* /usr/bin/

USER $UTILS_UID
WORKDIR $UTILS_HOME
ENTRYPOINT ["tini", "-g", "--"]
CMD ["bash"]