From 711ef3f7356a43d30fe08b683769eafac6ea7a3a Mon Sep 17 00:00:00 2001
From: Sergiy Markin
Date: Thu, 30 Jan 2025 18:12:47 +0000
Subject: [PATCH] Ceph rook gates improvement

This patchset fixes the instability of the ceph-rook gates
by adding extra nodes to the cluster.

Also improved ceph deployment process monitoring.

Change-Id: I405e501afc15f3974a047475a2b463e7f254da66
---
 tools/deployment/ceph/ceph-rook.sh | 37 ++++++++++++++++++++++++++-----
 tools/deployment/common/sleep.sh   |  9 ++++++++
 zuul.d/jobs.yaml                   |  5 +++--
 3 files changed, 43 insertions(+), 8 deletions(-)
 create mode 100755 tools/deployment/common/sleep.sh

diff --git a/tools/deployment/ceph/ceph-rook.sh b/tools/deployment/ceph/ceph-rook.sh
index c7564f119..0e5d45c93 100755
--- a/tools/deployment/ceph/ceph-rook.sh
+++ b/tools/deployment/ceph/ceph-rook.sh
@@ -394,10 +394,10 @@ cephClusterSpec:
   continueUpgradeAfterChecksEvenIfNotHealthy: false
   waitTimeoutForHealthyOSDInMinutes: 10
   mon:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
   mgr:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
     modules:
       - name: pg_autoscaler
@@ -636,6 +636,29 @@ EOF
 
 helm upgrade --install --create-namespace --namespace ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version ${ROOK_RELEASE} -f /tmp/ceph.yaml
 
+helm osh wait-for-pods rook-ceph
+
+kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s
+
+# Resolve the toolbox pod name only after the pod is Ready, so it is never empty
+TOOLS_POD=$(kubectl get pods \
+  --namespace=ceph \
+  --selector="app=rook-ceph-tools" \
+  --no-headers | awk '{ print $1; exit }')
+
+# Wait for all monitor pods to be ready
+MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
+for MON_POD in $MON_PODS; do
+  if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
+    kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s
+  else
+    echo "Pod $MON_POD not found, skipping..."
+  fi
+done
+
+echo "=========== CEPH K8S PODS LIST ============"
+kubectl get pods -n rook-ceph -o wide
+kubectl get pods -n ceph -o wide
 #NOTE: Wait for deploy
 RGW_POD=$(kubectl get pods \
   --namespace=ceph \
@@ -644,6 +667,12 @@ RGW_POD=$(kubectl get pods \
 while [[ -z "${RGW_POD}" ]]
 do
   sleep 5
+  echo "=========== CEPH STATUS ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
+  echo "=========== CEPH OSD POOL LIST ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph osd pool ls
+  echo "=========== CEPH K8S PODS LIST ============"
+  kubectl get pods -n ceph -o wide
   RGW_POD=$(kubectl get pods \
     --namespace=ceph \
     --selector="app=rook-ceph-rgw" \
@@ -652,8 +681,4 @@ done
 helm osh wait-for-pods ceph
 
 #NOTE: Validate deploy
-TOOLS_POD=$(kubectl get pods \
-  --namespace=ceph \
-  --selector="app=rook-ceph-tools" \
-  --no-headers | awk '{ print $1; exit }')
 kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
diff --git a/tools/deployment/common/sleep.sh b/tools/deployment/common/sleep.sh
new file mode 100755
index 000000000..cb8fe16b4
--- /dev/null
+++ b/tools/deployment/common/sleep.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -ex
+
+while true; do
+  echo "Sleeping for 100 seconds..."
+  # Pause for the announced interval; without this the loop spins and floods the log
+  sleep 100
+done
diff --git a/zuul.d/jobs.yaml b/zuul.d/jobs.yaml
index 33a12b629..206b67a30 100644
--- a/zuul.d/jobs.yaml
+++ b/zuul.d/jobs.yaml
@@ -120,7 +120,7 @@
 - job:
     name: openstack-helm-infra-logging
     parent: openstack-helm-infra-deploy
-    nodeset: openstack-helm-3nodes-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     vars:
       osh_params:
         openstack_release: "2024.1"
@@ -353,8 +353,9 @@
     name: openstack-helm-infra-cinder-2024-1-ubuntu_jammy
     description: |
       This job uses Rook for managing Ceph cluster.
-      The job is run on 3 nodes.
+      The job is run on 5 nodes.
     parent: openstack-helm-cinder-2024-1-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     files:
       - ^helm-toolkit/.*
       - ^roles/.*