Merge "Add test for RabbitMQ cluster alarms"
commit 6b41a46fad
@@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
     return result['stdout'][0].strip().split()
 
 
+def ban_resource(remote, resource, wait=None):
+    """Ban a resource from the current node.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param resource: resource name.
+    :type resource: str
+    :param wait: number of seconds to wait for the operation to complete.
+    :type wait: int
+    """
+    cmd = "pcs resource ban {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
+def clear_resource(remote, resource, wait=None):
+    """Clear a resource.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param resource: resource name.
+    :type resource: str
+    :param wait: number of seconds to wait for the operation to complete.
+    :type wait: int
+    """
+    cmd = "pcs resource clear {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
 def manage_pacemaker_service(remote, name, operation="restart"):
     """Operate HA service on remote node.
 
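The two helpers are thin wrappers that assemble a pcs command and run it over SSH. A minimal sketch with a stubbed remote (FakeRemote is a hypothetical stand-in for SSHClient, used only to show the command strings produced; it assumes ban_resource and clear_resource from the hunk above are in scope):

# Sketch only: FakeRemote fakes the SSH client so the generated
# pcs command lines can be printed instead of executed.
class FakeRemote(object):
    def check_call(self, cmd):
        print(cmd)

remote = FakeRemote()
ban_resource(remote, 'master_p_rabbitmq-server', wait=120)
# prints: pcs resource ban master_p_rabbitmq-server --wait=120
clear_resource(remote, 'master_p_rabbitmq-server')
# prints: pcs resource clear master_p_rabbitmq-server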
@@ -329,21 +329,28 @@ class ToolchainApi(object):
         filter_by = "node_role"
         if alarm_type == "service":
             filter_by = "service"
-        query = (
-            "select last(value) from {select_from} where time >= {time}"
-            " and source = '{source}' and {filter} and hostname = '{hostname}'"
-            " and value = {value}".format(
-                select_from="{}_status".format(alarm_type), time=time_interval,
-                source=source, hostname=hostname, value=value,
-                filter="{} = '{}'".format(filter_by, filter_value)))
+        filters = [
+            "time >= {}".format(time_interval),
+            "source = '{}'".format(source),
+            "{} = '{}'".format(filter_by, filter_value),
+            "value = {}".format(value)
+        ]
+        if hostname is not None:
+            filters.append("hostname = '{}'".format(hostname))
+
+        query = "select last(value) from {select_from} where {filters}".format(
+            select_from="{}_status".format(alarm_type),
+            filters=" and ".join(filters))
         logger.info("InfluxDB query: {}".format(query))
 
         def check_result():
             result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                 query=query).json()["results"][0]
             return len(result)
 
-        msg = ("Alarm with source {} and {} {} and value {} was"
-               " not triggered".format(source, filter_by, filter_value, value))
+        msg = ("Alarm of type: {}, entity: {}, source: {}, hostname: {}, "
+               "value: {} wasn't triggered".format(alarm_type, filter_value,
+                                                   source, hostname, value))
         devops_helpers.wait(check_result, timeout=60 * 5,
                             interval=10, timeout_msg=msg)
 
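The rewritten query builder makes the hostname filter optional, which is what lets the new test check a cluster-wide service status by passing hostname=None. A standalone sketch of the construction (all values below are illustrative examples, not taken from a live deployment):

# Example inputs; time_interval's shape is an assumption.
alarm_type = "service"
filter_by = "service"
time_interval = "now() - 5m"
source = "pacemaker"
filter_value = "rabbitmq-cluster"
value = 1
hostname = None

filters = [
    "time >= {}".format(time_interval),
    "source = '{}'".format(source),
    "{} = '{}'".format(filter_by, filter_value),
    "value = {}".format(value)
]
if hostname is not None:
    filters.append("hostname = '{}'".format(hostname))

query = "select last(value) from {select_from} where {filters}".format(
    select_from="{}_status".format(alarm_type),
    filters=" and ".join(filters))
print(query)
# select last(value) from service_status where time >= now() - 5m
# and source = 'pacemaker' and service = 'rabbitmq-cluster' and value = 1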
@@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api
 
 OKAY_STATUS = 0
 WARNING_STATUS = 1
 UNKNOWN_STATUS = 2
 CRITICAL_STATUS = 3
 DOWN_STATUS = 4
 
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
+
+RABBITMQ_DISK_WARNING_PERCENT = 99.99
+RABBITMQ_DISK_CRITICAL_PERCENT = 100
+RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
                                             RABBITMQ_MEMORY_WARNING_VALUE)
         self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
                                             RABBITMQ_MEMORY_CRITICAL_VALUE)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_pacemaker_alarms(self):
+        """Check that rabbitmq-pacemaker-* alarms work as expected.
+
+        Scenario:
+            1. Stop one slave RabbitMQ instance.
+            2. Check that the status of the RabbitMQ cluster is warning.
+            3. Stop the second slave RabbitMQ instance.
+            4. Check that the status of the RabbitMQ cluster is critical.
+            5. Stop the master RabbitMQ instance.
+            6. Check that the status of the RabbitMQ cluster is down.
+            7. Clear the RabbitMQ resource.
+            8. Check that the status of the RabbitMQ cluster is okay.
+
+        Duration 10m
+        """
+        def ban_and_check_status(node, status, wait=None):
+            with self.fuel_web.get_ssh_for_node(node.name) as remote:
+                logger.info("Ban rabbitmq resource on {}".format(node.name))
+                self.remote_ops.ban_resource(remote,
+                                             'master_p_rabbitmq-server',
+                                             wait=wait)
+            self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                              None, status)
+
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
+
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+
+        controller = controllers[0]
+        controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
+            controller)
+        rabbitmq_master = self.fuel_web.get_rabbit_master_node(
+            controller_node.name)
+        rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
+            controller_node.name)
+        ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
+        ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
+        # Don't wait for the pcs operation to complete as it will fail since
+        # the resource isn't running anywhere.
+        ban_and_check_status(rabbitmq_master, DOWN_STATUS)
+
+        logger.info("Clear rabbitmq resource")
+        with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
+            self.remote_ops.clear_resource(remote,
+                                           'master_p_rabbitmq-server',
+                                           wait=240)
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
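Every check_alarms call in the test passes hostname=None, which depends on the hostname filter becoming optional in the query-builder hunk above: the RabbitMQ cluster status is a service-level value rather than a per-node one. The two call shapes, as a sketch (argument order taken from the call sites in this diff; the 'node-1' hostname is hypothetical):

# Cluster-wide check, as used by this test: no hostname filter.
self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
                  None, DOWN_STATUS)
# Node-scoped check: the same query restricted to a single hostname.
self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
                  'node-1', DOWN_STATUS)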