add test remove all grastate.dat for OSP18 /refactor
Change-Id: I041821ab4229b9f20b16def5ca95170908785e5a

parent 7bfd11555a
commit 35c34ea980
@@ -32,5 +32,8 @@ skip_if_podified = _topology.skip_if_podified
 get_dataplane_ssh_keypair = _openshift.get_dataplane_ssh_keypair
 has_podified_cp = _openshift.has_podified_cp
 get_ovndbcluter = _openshift.get_ovndbcluter
+execute_in_pod = _openshift.execute_in_pod
+get_openstack_config_secret = _openshift.get_openstack_config_secret
+get_pods = _openshift.get_pods
 
 get_container_runtime_name = containers.get_container_runtime_name
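Note: this hunk just widens tobiko's public podified facade, re-exporting helpers implemented in the private _openshift module under stable package-level names. A minimal sketch of the pattern (file layout and names are illustrative, not tobiko's actual ones):

```python
# facade package __init__.py -- illustrative only, not tobiko's real layout
from . import _impl  # private module holding the real implementations

# re-export under stable public names; callers never import _impl directly
execute_in_pod = _impl.execute_in_pod
get_pods = _impl.get_pods
```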
@@ -27,6 +27,7 @@ LOG = log.getLogger(__name__)
 OSP_CONTROLPLANE = 'openstackcontrolplane'
 OSP_DP_NODESET = 'openstackdataplanenodeset'
 DP_SSH_SECRET_NAME = 'secret/dataplane-ansible-ssh-private-key-secret'
+OSP_CONFIG_SECRET_NAME = 'secret/openstack-config-secret'
 OSP_BM_HOST = 'baremetalhost.metal3.io'
 OSP_BM_CRD = 'baremetalhosts.metal3.io'
 OCP_WORKERS = 'nodes'
@@ -149,6 +150,17 @@ def get_dataplane_ssh_keypair():
     return private_key, public_key
 
 
+def get_openstack_config_secret():
+    with oc.project(CONF.tobiko.podified.osp_project):
+        try:
+            secret_object = oc.selector(OSP_CONFIG_SECRET_NAME).object()
+        except oc.OpenShiftPythonException as err:
+            LOG.info("Error while trying to get openstack config secret "
+                     f"{OSP_CONFIG_SECRET_NAME} from OpenShift. Error: {err}")
+            return
+        return secret_object.as_dict()
+
+
 def list_edpm_nodes():
     nodes = []
     with oc.project(CONF.tobiko.podified.osp_project):
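The new helper returns the secret as a plain dict, or None when the lookup fails. Since Kubernetes stores Secret values base64-encoded under the `data` map, a caller typically decodes them; a minimal sketch, assuming a hypothetical `clouds.yaml` key that is not necessarily present in this secret:

```python
# Hedged sketch of consuming get_openstack_config_secret(); the key name
# 'clouds.yaml' is an example only.
import base64

secret = get_openstack_config_secret()
if secret is not None:
    encoded = secret['data'].get('clouds.yaml')
    if encoded is not None:
        print(base64.b64decode(encoded).decode())
```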
@@ -398,3 +410,9 @@ def _check_ping_results(pod):
     else:
         tobiko.fail("Failed to copy ping log files from the POD "
                     f"{pod.name()}. Error: {cp.err}")
+
+
+def execute_in_pod(pod_name, command, container_name=None):
+    with oc.project(CONF.tobiko.podified.osp_project):
+        return oc.selector(f'pod/{pod_name}').object().execute(
+            ['sh', '-c', command], container_name=container_name)
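execute_in_pod() is the building block most of the new disruption code leans on. A usage sketch (pod name and command are examples only; the `.out()`/`.err()` accessors are the same ones the galera helpers below use):

```python
# Hypothetical call; pod name and command are examples only.
result = execute_in_pod('openstack-cell1-galera-0', 'mysql --version',
                        container_name='galera')
print(result.out())  # command stdout
print(result.err())  # stderr, may carry the "Defaulted container" notice
```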
@@ -15,8 +15,9 @@
 # under the License.
 from __future__ import absolute_import
 
+import functools
 import re
-import time
+import random
 
 import openshift_client as oc
 from oslo_log import log
@@ -24,115 +25,138 @@ from oslo_log import log
 import tobiko
 from tobiko import config
 from tobiko import podified
-# from tobiko.openstack import glance
-# from tobiko.openstack import keystone
-# from tobiko.openstack import neutron
-# from tobiko.openstack import stacks
-# from tobiko.openstack import tests
-# from tobiko.openstack import topology
-# from tobiko.tests.faults.ha import test_cloud_recovery
-# from tobiko.shell import ping
-# from tobiko.shell import sh
+from tobiko.openstack import keystone
 
 CONF = config.CONF
 LOG = log.getLogger(__name__)
 
+kill_galera = 'kill -9 $(pidof mysqld)'
+rm_grastate = 'rm -rf /var/lib/mysql/grastate.dat'
+galera_cluster_size = 'mysql -u root --password={passwd} -e \'SHOW STATUS ' \
+                      'LIKE "wsrep_cluster_size"\''
+check_bootstrap = """
+ps -eo lstart,cmd | grep -v grep|
+grep wsrep-cluster-address=gcomm://
+"""
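For context: grastate.dat is where Galera persists a node's last known cluster state, including the safe_to_bootstrap flag, so deleting it forces the cluster to pick its bootstrap node another way; that is exactly the failure mode the new tests provoke. A quick sanity check of how the command template above renders (placeholder password only):

```python
# Rendering the galera_cluster_size template; '***' stands in for the real
# password, which the helpers below pull from keystone credentials.
galera_cluster_size = ('mysql -u root --password={passwd} -e \'SHOW STATUS '
                       'LIKE "wsrep_cluster_size"\'')
print(galera_cluster_size.format(passwd='***'))
# -> mysql -u root --password=*** -e 'SHOW STATUS LIKE "wsrep_cluster_size"'
```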
+
+
+class GaleraBoostrapException(tobiko.TobikoException):
+    message = "Bootstrap has not been activated"
+
+
+class DownException(tobiko.TobikoException):
+    message = "The resource is not down"
+
+
+class RestoredException(tobiko.TobikoException):
+    message = "The resource is not restored"
+
+
+@functools.lru_cache()
+def get_galera_pods_per_service(galera_service):
+    # the aim of this function is just to cache results and avoid sending
+    # oc requests every time
+    return podified.get_pods({'service': galera_service})
+
+
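One caveat with @functools.lru_cache() here: the pod list is memoized for the whole process lifetime, so if a disruption ever recreates pods under new names the cached objects go stale. The standard escape hatch:

```python
# lru_cache-decorated functions expose cache_clear(); calling it forces the
# next lookup to hit the OpenShift API again.
get_galera_pods_per_service.cache_clear()
```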
-@podified.skip_if_not_podified
 def kill_all_galera_services():
     """kill all galera processes,
     check in pacemaker it is down"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
-
-    retry = tobiko.retry(timeout=30, interval=5)
-    for _ in retry:
-        try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-            ).execute(['sh', '-c', """mysql -u root --password=12345678
-                      -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
-            filtered_err_output = error_output.replace(non_error_message, '')
-            if not filtered_err_output.strip():
-                continue
-        except oc.OpenShiftPythonException:
-            LOG.info('all galera cells down')
-            break
-    time.sleep(60)
-    for _ in retry:
-        try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                    'pod/openstack-cell1-galera-0').object().execute(
-                    ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                    STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                    ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
-        except oc.OpenShiftPythonException:
-            continue
-    return False
+    # get galera pods sorted into 2 different lists:
+    # one with 'cell1-galera' and one without
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
 
 
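The refactor replaces the old inline pod counting (walking every node with oc.get_pods_by_node and matching 'cell1-galera' in the fqname) with one label lookup per galera service, then runs the same kill/check/verify pipeline for both the cell1 and the top-level galera service. A hedged sketch of what the label lookup presumably resolves to underneath (the exact podified.get_pods internals may differ):

```python
import openshift_client as oc

# Assumption: podified.get_pods({'service': ...}) selects pods by their
# 'service' label, roughly equivalent to this query.
pods = oc.selector('pods', labels={'service': 'openstack-galera'}).objects()
print([pod.name() for pod in pods])
```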
-@podified.skip_if_not_podified
 def remove_all_grastate_galera():
     """shut down galera properly,
     remove all grastate"""
-    galera_pods_num = sum(
-        1 for node_name in oc.selector('nodes').qnames()
-        for pod_obj in oc.get_pods_by_node(node_name)
-        if 'cell1-galera' in pod_obj.fqname()
-    )
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'rm -rf /var/lib/mysql/grastate.dat '],
-                     container_name='galera')
-        LOG.info('delete grastate.dat cell-{}'.format(i))
-    for i in range(galera_pods_num):
-        oc.selector('pod/openstack-cell1-galera-{}'.format(i)).object()\
-            .execute(['sh', '-c', 'kill -9 $(pidof mysqld)'],
-                     container_name='galera')
-        LOG.info('kill galera cell-{}'.format(i))
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        for pod in pods:
+            remove_grastate(pod.name())
+        # TODO: change kill to graceful stop / scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+
+
+def remove_one_grastate_galera():
+    """shut down galera properly,
+    delete /var/lib/mysql/grastate.dat in a random node,
+    check that bootstrap is done from a node with grastate"""
+    for galera_service in ('openstack-cell1-galera', 'openstack-galera'):
+        pods = get_galera_pods_per_service(galera_service)
+        random_pod_name = random.choice(pods).name()
+        remove_grastate(random_pod_name)
+        # TODO: change kill to graceful stop / scale down
+        kill_all_galera_pods(pods)
+        check_all_galera_cells_down(pods[0].name())
+        verify_all_galera_cells_restored(pods)
+        # gcomm:// without args means that bootstrap is done from this node
+        bootstrap = podified.execute_in_pod(
+            random_pod_name, check_bootstrap, 'galera').out().strip()
+        if len(pods) > 1:
+            if re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', bootstrap
+                         ) is None:
+                raise GaleraBoostrapException()
+        elif re.search(r'wsrep-cluster-address=gcomm://', bootstrap) is None:
+            raise GaleraBoostrapException()
+        lastDate = re.findall(r"\w{,3}\s*\w{,3}\s*\d{,2}\s*\d{,2}:\d{,2}"
+                              r":\d{,2}\s*\d{4}", bootstrap)[-1]
+        LOG.info(f'last bootstrap required at {lastDate}')
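The two bootstrap assertions differ only in the regex. With more than one pod, recovery must bootstrap from a bare gcomm:// (empty address list), which the (?:\s|$) suffix enforces; with a single pod any gcomm:// occurrence is accepted. A quick self-contained demonstration with made-up process lines:

```python
import re

# Made-up `ps` output lines, mimicking what check_bootstrap would return.
bare = 'mysqld --wsrep-cluster-address=gcomm:// --wsrep-new-cluster'
addressed = 'mysqld --wsrep-cluster-address=gcomm://node-1,node-2'

assert re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', bare)
assert re.search(r'wsrep-cluster-address=gcomm://(?:\s|$)', addressed) is None
assert re.search(r'wsrep-cluster-address=gcomm://', addressed)  # looser form
```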
+
+
+def remove_grastate(pod_name):
+    podified.execute_in_pod(pod_name, rm_grastate, 'galera')
+    LOG.info(f'grastate.dat removed from {pod_name}')
+
+
+def kill_all_galera_pods(galera_pods):
+    for pod in galera_pods:
+        podified.execute_in_pod(pod.name(), kill_galera, 'galera')
+        LOG.info(f'kill galera pod {pod}')
+
+
+def check_all_galera_cells_down(pod_name):
+    pw = keystone.keystone_credentials().password
+
     retry = tobiko.retry(timeout=30, interval=5)
     for _ in retry:
         try:
-            # checks wsrep cluster size is now unavailable
-            result = oc.selector('pod/openstack-cell1-galera-0').object(
-            ).execute(['sh', '-c', """mysql -u root --password=12345678
-                      -e 'SHOW STATUS LIKE "wsrep_cluster_size"'"""])
-            # Capture and filter the error output
-            error_output = result.err()
-            non_error_message = """
-            Defaulted container "galera" out of: galera,
-            mysql-bootstrap (init)\n"""
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
+            error_output = cluster_size.err()
+            non_error_message = "Defaulted container \"galera\" out of: "\
+                "galera, mysql-bootstrap (init)\n"
             filtered_err_output = error_output.replace(non_error_message, '')
             if not filtered_err_output.strip():
                 continue
         except oc.OpenShiftPythonException:
             LOG.info('all galera cells down')
-            break
-    time.sleep(60)
+            return
+    raise DownException()
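check_all_galera_cells_down() keeps polling while the mysql query still answers (stderr empty once the harmless kubectl notice is filtered out) and treats an OpenShiftPythonException from the exec as proof that mysqld is gone; if that never happens within the retry budget it raises DownException. The stderr filtering is easy to check in isolation:

```python
# Self-contained check of the filtering logic, with sample stderr strings
# mimicking `oc exec` against a pod that has an init container.
non_error_message = ('Defaulted container "galera" out of: '
                     'galera, mysql-bootstrap (init)\n')

def is_only_default_notice(stderr):
    return not stderr.replace(non_error_message, '').strip()

assert is_only_default_notice(non_error_message)
assert not is_only_default_notice(non_error_message + 'ERROR 2002 (HY000)')
```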
+
+
+def verify_all_galera_cells_restored(pods):
+    pw = keystone.keystone_credentials().password
+
+    retry = tobiko.retry(timeout=60, interval=10)
     for _ in retry:
+        pod_name = pods[0].name()
         try:
-            if int(re.search(r'wsrep_cluster_size\s+(\d+)', oc.selector(
-                    'pod/openstack-cell1-galera-0').object().execute(
-                    ['sh', '-c', """mysql -u root --password=12345678 -e 'SHOW
-                    STATUS LIKE "wsrep_cluster_size"'"""], container_name='galera'
-                    ).out()).group(1)) == galera_pods_num:
-                LOG.info('all galera cells are restored')
-                return
+            cluster_size = podified.execute_in_pod(
+                pod_name, galera_cluster_size.format(passwd=pw), 'galera')
         except oc.OpenShiftPythonException:
             continue
-    return False
+        wsrep_cluster_size = int(re.search(r'wsrep_cluster_size\s+(\d+)',
+                                           cluster_size.out()).group(1))
+        if wsrep_cluster_size == len(pods):
+            LOG.info('all galera cells are restored')
+            return
+
+    raise RestoredException()
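verify_all_galera_cells_restored() now compares the reported wsrep_cluster_size against len(pods) instead of the old precomputed galera_pods_num, which also makes it correct for a single-pod galera service. What the parsing acts on, with illustrative mysql output:

```python
import re

# Illustrative output of SHOW STATUS LIKE "wsrep_cluster_size".
sample_out = 'Variable_name\tValue\nwsrep_cluster_size\t3\n'
size = int(re.search(r'wsrep_cluster_size\s+(\d+)', sample_out).group(1))
assert size == 3
```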
@@ -31,3 +31,13 @@ class DisruptPodifiedNodesTest(testtools.TestCase):
         # HealthCheck.run_before()
         cloud_disruptions.kill_all_galera_services()
         # HealthCheck.run_after()
+
+    def test_remove_all_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_all_grastate_galera()
+        # HealthCheck.run_after()
+
+    def test_remove_one_grastate_galera(self):
+        # HealthCheck.run_before()
+        cloud_disruptions.remove_one_grastate_galera()
+        # HealthCheck.run_after()
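The new cases mirror the existing kill_all_galera_services test: there are no assertions in the test body itself, because the disruption helpers raise (DownException, RestoredException, GaleraBoostrapException) on failure. A condensed sketch of the class shape being extended (the cloud_disruptions import path is an assumption for illustration, and the HealthCheck hooks remain commented out upstream):

```python
import testtools

# Assumed import path; the real module location may differ.
from tobiko.tests.faults.podified import cloud_disruptions


class DisruptPodifiedNodesTest(testtools.TestCase):

    def test_remove_one_grastate_galera(self):
        # the helper raises if galera fails to bootstrap or recover
        cloud_disruptions.remove_one_grastate_galera()
```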