Ceph rook gates improvement

This patchset fixes the instability of the
ceph-rook gates by adding extra nodes to the
cluster.

Also improves monitoring of the Ceph deployment process.

Change-Id: I405e501afc15f3974a047475a2b463e7f254da66
This commit is contained in:
Sergiy Markin 2025-01-30 18:12:47 +00:00
parent fe95c4d1cf
commit 711ef3f735
3 changed files with 40 additions and 8 deletions

View File

@@ -394,10 +394,10 @@ cephClusterSpec:
continueUpgradeAfterChecksEvenIfNotHealthy: false continueUpgradeAfterChecksEvenIfNotHealthy: false
waitTimeoutForHealthyOSDInMinutes: 10 waitTimeoutForHealthyOSDInMinutes: 10
mon: mon:
count: 1 count: 3
allowMultiplePerNode: false allowMultiplePerNode: false
mgr: mgr:
count: 1 count: 3
allowMultiplePerNode: false allowMultiplePerNode: false
modules: modules:
- name: pg_autoscaler - name: pg_autoscaler
@@ -636,6 +636,28 @@ EOF
helm upgrade --install --create-namespace --namespace ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version ${ROOK_RELEASE} -f /tmp/ceph.yaml helm upgrade --install --create-namespace --namespace ceph rook-ceph-cluster --set operatorNamespace=rook-ceph rook-release/rook-ceph-cluster --version ${ROOK_RELEASE} -f /tmp/ceph.yaml
TOOLS_POD=$(kubectl get pods \
--namespace=ceph \
--selector="app=rook-ceph-tools" \
--no-headers | awk '{ print $1; exit }')
helm osh wait-for-pods rook-ceph
kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s
# Wait for all monitor pods to be ready
MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
for MON_POD in $MON_PODS; do
if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s
else
echo "Pod $MON_POD not found, skipping..."
fi
done
echo "=========== CEPH K8S PODS LIST ============"
kubectl get pods -n rook-ceph -o wide
kubectl get pods -n ceph -o wide
#NOTE: Wait for deploy #NOTE: Wait for deploy
RGW_POD=$(kubectl get pods \ RGW_POD=$(kubectl get pods \
--namespace=ceph \ --namespace=ceph \
@@ -644,6 +666,12 @@ RGW_POD=$(kubectl get pods \
while [[ -z "${RGW_POD}" ]] while [[ -z "${RGW_POD}" ]]
do do
sleep 5 sleep 5
echo "=========== CEPH STATUS ============"
kubectl exec -n ceph ${TOOLS_POD} -- ceph -s
echo "=========== CEPH OSD POOL LIST ============"
kubectl exec -n ceph ${TOOLS_POD} -- ceph osd pool ls
echo "=========== CEPH K8S PODS LIST ============"
kubectl get pods -n ceph -o wide
RGW_POD=$(kubectl get pods \ RGW_POD=$(kubectl get pods \
--namespace=ceph \ --namespace=ceph \
--selector="app=rook-ceph-rgw" \ --selector="app=rook-ceph-rgw" \
@@ -652,8 +680,4 @@ done
helm osh wait-for-pods ceph helm osh wait-for-pods ceph
#NOTE: Validate deploy #NOTE: Validate deploy
TOOLS_POD=$(kubectl get pods \
--namespace=ceph \
--selector="app=rook-ceph-tools" \
--no-headers | awk '{ print $1; exit }')
kubectl exec -n ceph ${TOOLS_POD} -- ceph -s kubectl exec -n ceph ${TOOLS_POD} -- ceph -s

View File

@@ -0,0 +1,7 @@
#!/bin/bash
set -ex
while true; do
echo "Sleeping for 100 seconds..."
sleep 100
done

View File

@@ -120,7 +120,7 @@
- job: - job:
name: openstack-helm-infra-logging name: openstack-helm-infra-logging
parent: openstack-helm-infra-deploy parent: openstack-helm-infra-deploy
nodeset: openstack-helm-3nodes-ubuntu_jammy nodeset: openstack-helm-5nodes-ubuntu_jammy
vars: vars:
osh_params: osh_params:
openstack_release: "2024.1" openstack_release: "2024.1"
@@ -353,8 +353,9 @@
name: openstack-helm-infra-cinder-2024-1-ubuntu_jammy name: openstack-helm-infra-cinder-2024-1-ubuntu_jammy
description: | description: |
This job uses Rook for managing Ceph cluster. This job uses Rook for managing Ceph cluster.
The job is run on 3 nodes. The job is run on 5 nodes.
parent: openstack-helm-cinder-2024-1-ubuntu_jammy parent: openstack-helm-cinder-2024-1-ubuntu_jammy
nodeset: openstack-helm-5nodes-ubuntu_jammy
files: files:
- ^helm-toolkit/.* - ^helm-toolkit/.*
- ^roles/.* - ^roles/.*