From 24d0656167008954f00ce0fd7aa8d86f047658c7 Mon Sep 17 00:00:00 2001 From: Tee Ngo Date: Thu, 13 Jun 2019 22:46:03 -0400 Subject: [PATCH] Add pods wait time to initial bootstrap play In recent loads that include a kernel update, among other code changes across various StarlingX repos, not all kube-system pods are started by the time the host comes online, whereas they consistently were in the same slow lab with an older load. As a result, the bootstrap playbook often fails in this slow lab near the end, where it verifies the readiness of the kube-system pods. This commit is a follow-up to commit 97181aa756854800d40db3f6099ec31541b47a88. It applies a 30-second pause to the initial play to ensure all pods have been started before executing the task that waits for them to become ready. The total wait time for a replay remains unchanged at 60 seconds. Tests: Play and replay the bootstrap playbook locally on slow hardware. Closes-Bug: 1831664 Change-Id: I525c7771eafad2b9e79dd89e985696fb16bb5b24 Signed-off-by: Tee Ngo --- .../tasks/bringup_kubemaster.yml | 2 +- .../roles/bringup-essential-services/tasks/main.yml | 8 ++------ .../playbooks/bootstrap/roles/prepare-env/tasks/main.yml | 2 ++ 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/bringup_kubemaster.yml b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/bringup_kubemaster.yml index 7587b701a..847b9bf65 100644 --- a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/bringup_kubemaster.yml +++ b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/bringup_kubemaster.yml @@ -15,7 +15,7 @@ # - Prepare admin.conf # - Set k8s environment variable for new shell # - Prepare Calico config and activate Calico networking -# - Precare Multus config and activate Multus networking +# - Prepare Multus config and 
activate Multus networking # - Prepare SRIOV config and activate SRIOV networking # - Prepare SRIOV device plugin config and activate SRIOV device plugin # - Restrict coredns to master node and set anti-affnity (duplex system) diff --git a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/main.yml b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/main.yml index b0642407f..80582fd91 100644 --- a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/main.yml +++ b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/bringup-essential-services/tasks/main.yml @@ -65,13 +65,9 @@ until: online_check.rc == 0 retries: 10 - # Don't need to run this task for initial play as it will take a while to pull - # Armada image and additional time to wait for controller-0 to become online - # during which time kube-system pods are all started. - - name: Wait for 60 seconds to ensure kube-system pods are all started + - name: Wait for {{ pods_wait_time }} seconds to ensure kube-system pods are all started wait_for: - timeout: 60 - when: restart_services + timeout: "{{ pods_wait_time }}" - name: Start parallel tasks to wait for Kubernetes component, Networking and Tiller pods to reach ready state command: kubectl --kubeconfig=/etc/kubernetes/admin.conf wait --namespace=kube-system --for=condition=Ready pods --selector {{ item }} --timeout=30s diff --git a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/prepare-env/tasks/main.yml b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/prepare-env/tasks/main.yml index 72057b7c2..ea7a27302 100644 --- a/playbookconfig/playbookconfig/playbooks/bootstrap/roles/prepare-env/tasks/main.yml +++ b/playbookconfig/playbookconfig/playbooks/bootstrap/roles/prepare-env/tasks/main.yml @@ -190,6 +190,7 @@ derived_network_params: place_holder: place_holder ansible_remote_tmp: "{{ ansible_remote_tmp | default(lookup('ini', 
'remote_tmp section=defaults file={{ playbook_dir }}/ansible.cfg')) }}" + pods_wait_time: "{{ pods_wait_time | default(30) }}" - name: Turn on use_docker_proxy flag set_fact: @@ -358,6 +359,7 @@ - name: Turn on restart services flag if management/oam/cluster network or docker config is changed set_fact: restart_services: true + pods_wait_time: "{{ pods_wait_time|int + 30 }}" when: reconfigure_endpoints or docker_config_update or (prev_cluster_host_subnet != cluster_host_subnet) or