Ceph rook gates improvement

This patchset fixes the instability of the ceph-rook gates
by adding extra nodes to the cluster, which allows the Ceph
mon and mgr counts to be raised from 1 to 3.

It also improves monitoring of the Ceph deployment process.

Change-Id: I405e501afc15f3974a047475a2b463e7f254da66
Sergiy Markin 2025-01-30 18:12:47 +00:00
parent fe95c4d1cf
commit 711ef3f735
3 changed files with 40 additions and 8 deletions


@@ -394,10 +394,10 @@ cephClusterSpec:
   continueUpgradeAfterChecksEvenIfNotHealthy: false
   waitTimeoutForHealthyOSDInMinutes: 10
   mon:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
   mgr:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
     modules:
       - name: pg_autoscaler
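
Raising the mon and mgr counts from 1 to 3 is the core of the stability fix: a single monitor is a single point of failure, while a three-member monitor quorum keeps a majority (2 of 3) through the loss of one member. Since allowMultiplePerNode stays false, each of the three mons must land on a separate node, which is why the zuul jobs below move from 3-node to 5-node nodesets.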
@@ -636,6 +636,28 @@ EOF
 helm upgrade --install --create-namespace --namespace ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version ${ROOK_RELEASE} -f /tmp/ceph.yaml
+TOOLS_POD=$(kubectl get pods \
+  --namespace=ceph \
+  --selector="app=rook-ceph-tools" \
+  --no-headers | awk '{ print $1; exit }')
 helm osh wait-for-pods rook-ceph
+kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s
+# Wait for all monitor pods to be ready
+MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
+for MON_POD in $MON_PODS; do
+  if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
+    kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s
+  else
+    echo "Pod $MON_POD not found, skipping..."
+  fi
+done
+echo "=========== CEPH K8S PODS LIST ============"
+kubectl get pods -n rook-ceph -o wide
+kubectl get pods -n ceph -o wide
 #NOTE: Wait for deploy
 RGW_POD=$(kubectl get pods \
   --namespace=ceph \
@@ -644,6 +666,12 @@ RGW_POD=$(kubectl get pods \
 while [[ -z "${RGW_POD}" ]]
 do
   sleep 5
+  echo "=========== CEPH STATUS ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
+  echo "=========== CEPH OSD POOL LIST ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph osd pool ls
+  echo "=========== CEPH K8S PODS LIST ============"
+  kubectl get pods -n ceph -o wide
   RGW_POD=$(kubectl get pods \
     --namespace=ceph \
     --selector="app=rook-ceph-rgw" \
@@ -652,8 +680,4 @@ done
 helm osh wait-for-pods ceph
 #NOTE: Validate deploy
-TOOLS_POD=$(kubectl get pods \
-  --namespace=ceph \
-  --selector="app=rook-ceph-tools" \
-  --no-headers | awk '{ print $1; exit }')
 kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
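
The per-pod loop added above re-checks that each mon pod still exists before waiting on it, so a pod that is replaced during startup is skipped rather than failing the gate. Where that tolerance is not needed, the same wait collapses into a single label-based call; a minimal sketch using only flags already present in this script (note that kubectl wait can exit non-zero if the selector matches no pods yet, or if a matched pod disappears mid-wait, which is exactly what the loop guards against):

# Wait for every mon pod in one call; assumes the mon pods
# have already been scheduled by the time this runs.
kubectl wait --namespace=ceph \
  --for=condition=ready pod \
  --selector=app=rook-ceph-mon \
  --timeout=600s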


@@ -0,0 +1,7 @@
+#!/bin/bash
+set -ex
+# Emit a log line every 100 seconds, forever.
+while true; do
+  echo "Sleeping for 100 seconds..."
+  sleep 100
+done
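
The diff does not show where this new helper is wired in; a loop of this shape is typically used to keep a container's entrypoint in the foreground while emitting periodic output for log collectors. A hypothetical invocation (the pod name and image are assumptions, not part of this change):

kubectl run keepalive --namespace=ceph --image=ubuntu:22.04 \
  --restart=Never --command -- /bin/bash -c \
  'set -ex; while true; do echo "Sleeping for 100 seconds..."; sleep 100; done'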


@@ -120,7 +120,7 @@
 - job:
     name: openstack-helm-infra-logging
     parent: openstack-helm-infra-deploy
-    nodeset: openstack-helm-3nodes-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     vars:
       osh_params:
         openstack_release: "2024.1"
@@ -353,8 +353,9 @@
     name: openstack-helm-infra-cinder-2024-1-ubuntu_jammy
     description: |
       This job uses Rook for managing Ceph cluster.
-      The job is run on 3 nodes.
+      The job is run on 5 nodes.
     parent: openstack-helm-cinder-2024-1-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     files:
       - ^helm-toolkit/.*
       - ^roles/.*
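
Both jobs now reference openstack-helm-5nodes-ubuntu_jammy, a nodeset that must be defined elsewhere in the repository's zuul configuration (its definition is not part of this diff). For orientation, a minimal sketch of what such a Zuul nodeset definition looks like, with the node names being assumptions:

- nodeset:
    name: openstack-helm-5nodes-ubuntu_jammy
    nodes:
      - name: primary
        label: ubuntu-jammy
      - name: node-1
        label: ubuntu-jammy
      - name: node-2
        label: ubuntu-jammy
      - name: node-3
        label: ubuntu-jammy
      - name: node-4
        label: ubuntu-jammy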