Ceph rook gates improvement

This patchset fixes the instability of the ceph-rook gates
by adding extra nodes to the cluster, which allows the Ceph
mon and mgr counts to be raised from 1 to 3.

It also improves monitoring of the Ceph deployment process.

Change-Id: I405e501afc15f3974a047475a2b463e7f254da66
Sergiy Markin 2025-01-30 18:12:47 +00:00
parent fe95c4d1cf
commit 711ef3f735
3 changed files with 40 additions and 8 deletions


@@ -394,10 +394,10 @@ cephClusterSpec:
   continueUpgradeAfterChecksEvenIfNotHealthy: false
   waitTimeoutForHealthyOSDInMinutes: 10
   mon:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
   mgr:
-    count: 1
+    count: 3
     allowMultiplePerNode: false
     modules:
       - name: pg_autoscaler
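
Raising the mon and mgr counts from 1 to 3 is the core of the stability fix: a single monitor is a single point of failure, while a three-member monitor quorum keeps a majority (2 of 3) through the loss of one member. Since allowMultiplePerNode stays false, each of the three mons must land on a separate node, which is why the zuul jobs below move from 3-node to 5-node nodesets.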
@@ -636,6 +636,28 @@ EOF
 helm upgrade --install --create-namespace --namespace ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version ${ROOK_RELEASE} -f /tmp/ceph.yaml
+TOOLS_POD=$(kubectl get pods \
+  --namespace=ceph \
+  --selector="app=rook-ceph-tools" \
+  --no-headers | awk '{ print $1; exit }')
 helm osh wait-for-pods rook-ceph
+kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s
+# Wait for all monitor pods to be ready
+MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
+for MON_POD in $MON_PODS; do
+  if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
+    kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s
+  else
+    echo "Pod $MON_POD not found, skipping..."
+  fi
+done
+echo "=========== CEPH K8S PODS LIST ============"
+kubectl get pods -n rook-ceph -o wide
+kubectl get pods -n ceph -o wide
 #NOTE: Wait for deploy
 RGW_POD=$(kubectl get pods \
   --namespace=ceph \
@@ -644,6 +666,12 @@ RGW_POD=$(kubectl get pods \
 while [[ -z "${RGW_POD}" ]]
 do
   sleep 5
+  echo "=========== CEPH STATUS ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
+  echo "=========== CEPH OSD POOL LIST ============"
+  kubectl exec -n ceph ${TOOLS_POD} -- ceph osd pool ls
+  echo "=========== CEPH K8S PODS LIST ============"
+  kubectl get pods -n ceph -o wide
   RGW_POD=$(kubectl get pods \
     --namespace=ceph \
     --selector="app=rook-ceph-rgw" \
@@ -652,8 +680,4 @@ done
 helm osh wait-for-pods ceph
 #NOTE: Validate deploy
-TOOLS_POD=$(kubectl get pods \
-  --namespace=ceph \
-  --selector="app=rook-ceph-tools" \
-  --no-headers | awk '{ print $1; exit }')
 kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
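
The per-pod loop added above re-checks that each mon pod still exists before waiting on it, so a pod that is replaced during startup is skipped rather than failing the gate. Where that tolerance is not needed, the same wait collapses into a single label-based call; a minimal sketch using only flags already present in this script (note that kubectl wait can exit non-zero if the selector matches no pods yet, or if a matched pod disappears mid-wait, which is exactly what the loop guards against):

# Wait for every mon pod in one call; assumes the mon pods
# have already been scheduled by the time this runs.
kubectl wait --namespace=ceph \
  --for=condition=ready pod \
  --selector=app=rook-ceph-mon \
  --timeout=600s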


@@ -0,0 +1,7 @@
+#!/bin/bash
+set -ex
+# Emit a log line every 100 seconds, forever.
+while true; do
+  echo "Sleeping for 100 seconds..."
+  sleep 100
+done
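
The diff does not show where this new helper is wired in; a loop of this shape is typically used to keep a container's entrypoint in the foreground while emitting periodic output for log collectors. A hypothetical invocation (the pod name and image are assumptions, not part of this change):

kubectl run keepalive --namespace=ceph --image=ubuntu:22.04 \
  --restart=Never --command -- /bin/bash -c \
  'set -ex; while true; do echo "Sleeping for 100 seconds..."; sleep 100; done'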


@@ -120,7 +120,7 @@
 - job:
     name: openstack-helm-infra-logging
     parent: openstack-helm-infra-deploy
-    nodeset: openstack-helm-3nodes-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     vars:
       osh_params:
         openstack_release: "2024.1"
@@ -353,8 +353,9 @@
     name: openstack-helm-infra-cinder-2024-1-ubuntu_jammy
     description: |
       This job uses Rook for managing Ceph cluster.
-      The job is run on 3 nodes.
+      The job is run on 5 nodes.
     parent: openstack-helm-cinder-2024-1-ubuntu_jammy
+    nodeset: openstack-helm-5nodes-ubuntu_jammy
     files:
       - ^helm-toolkit/.*
       - ^roles/.*
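
Both jobs now reference openstack-helm-5nodes-ubuntu_jammy, a nodeset that must be defined elsewhere in the repository's zuul configuration (its definition is not part of this diff). For orientation, a minimal sketch of what such a Zuul nodeset definition looks like, with the node names being assumptions:

- nodeset:
    name: openstack-helm-5nodes-ubuntu_jammy
    nodes:
      - name: primary
        label: ubuntu-jammy
      - name: node-1
        label: ubuntu-jammy
      - name: node-2
        label: ubuntu-jammy
      - name: node-3
        label: ubuntu-jammy
      - name: node-4
        label: ubuntu-jammy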