Merge "Add test for RabbitMQ cluster alarms"

This commit is contained in:
Jenkins 2016-09-02 13:57:38 +00:00 committed by Gerrit Code Review
commit 6b41a46fad
3 changed files with 108 additions and 9 deletions

View File

@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
return result['stdout'][0].strip().split()
def ban_resource(remote, resource, wait=None):
    """Ban a resource from the current node.

    Builds a ``pcs resource ban <resource>`` command line and runs it
    through ``remote.check_call``.

    :param remote: SSH connection to the node.
    :type remote: SSHClient
    :param resource: resource name.
    :type resource: str
    :param wait: number of seconds to wait for the operation to complete;
        when None (the default) no ``--wait`` option is added.
    :type wait: int
    """
    cmd = "pcs resource ban {}".format(resource)
    # Compare against None rather than relying on truthiness so that an
    # explicit wait=0 is still forwarded on the command line.
    if wait is not None:
        cmd = "{} --wait={}".format(cmd, wait)
    remote.check_call(cmd)
def clear_resource(remote, resource, wait=None):
    """Clear a resource.

    Builds a ``pcs resource clear <resource>`` command line and runs it
    through ``remote.check_call``.

    :param remote: SSH connection to the node.
    :type remote: SSHClient
    :param resource: resource name.
    :type resource: str
    :param wait: number of seconds to wait for the operation to complete;
        when None (the default) no ``--wait`` option is added.
    :type wait: int
    """
    cmd = "pcs resource clear {}".format(resource)
    # Compare against None rather than relying on truthiness so that an
    # explicit wait=0 is still forwarded on the command line.
    if wait is not None:
        cmd = "{} --wait={}".format(cmd, wait)
    remote.check_call(cmd)
def manage_pacemaker_service(remote, name, operation="restart"):
"""Operate HA service on remote node.

View File

@ -329,21 +329,28 @@ class ToolchainApi(object):
filter_by = "node_role"
if alarm_type == "service":
filter_by = "service"
query = (
"select last(value) from {select_from} where time >= {time}"
" and source = '{source}' and {filter} and hostname = '{hostname}'"
" and value = {value}".format(
select_from="{}_status".format(alarm_type), time=time_interval,
source=source, hostname=hostname, value=value,
filter="{} = '{}'".format(filter_by, filter_value)))
filters = [
"time >= {}".format(time_interval),
"source = '{}'".format(source),
"{} = '{}'".format(filter_by, filter_value),
"value = {}".format(value)
]
if hostname is not None:
filters.append("hostname = '{}'".format(hostname))
query = "select last(value) from {select_from} where {filters}".format(
select_from="{}_status".format(alarm_type),
filters=" and ".join(filters))
logger.info("InfluxDB query: {}".format(query))
def check_result():
result = self.INFLUXDB_GRAFANA.do_influxdb_query(
query=query).json()["results"][0]
return len(result)
msg = ("Alarm with source {} and {} {} and value {} was"
" not triggered".format(source, filter_by, filter_value, value))
msg = ("Alarm of type: {}: entity: {}, source:{}, hostname: {}, "
"value: {} wasn't triggered".format(alarm_type, filter_value,
source, hostname, value))
devops_helpers.wait(check_result, timeout=60 * 5,
interval=10, timeout_msg=msg)

View File

@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api
# Alarm status values. The tests in this module pass them to check_alarms()
# and to the alarm-check helpers as the expected status of an alarm.
OKAY_STATUS = 0
WARNING_STATUS = 1
UNKNOWN_STATUS = 2
CRITICAL_STATUS = 3
DOWN_STATUS = 4
# Threshold values consumed by checks outside this view.
# NOTE(review): presumably usage percentages expected to raise the warning
# and critical alarms respectively -- confirm against the callers.
WARNING_PERCENT = 91
CRITICAL_PERCENT = 96
# NOTE(review): presumably the RabbitMQ disk levels expected to raise the
# warning/critical alarms -- confirm against the callers.
RABBITMQ_DISK_WARNING_PERCENT = 99.99
RABBITMQ_DISK_CRITICAL_PERCENT = 100
# NOTE(review): appears to be the value handed to the RabbitMQ memory alarm
# check for the warning level; unit/meaning not visible here -- confirm.
RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
RABBITMQ_MEMORY_WARNING_VALUE)
self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
RABBITMQ_MEMORY_CRITICAL_VALUE)
@test(depends_on_groups=["deploy_ha_toolchain"],
      groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
@log_snapshot_after_test
def check_rabbitmq_pacemaker_alarms(self):
    """Check that rabbitmq-pacemaker-* alarms work as expected.

    Scenario:
        1. Stop one slave RabbitMQ instance.
        2. Check that the status of the RabbitMQ cluster is warning.
        3. Stop the second slave RabbitMQ instance.
        4. Check that the status of the RabbitMQ cluster is critical.
        5. Stop the master RabbitMQ instance.
        6. Check that the status of the RabbitMQ cluster is down.
        7. Clear the RabbitMQ resource.
        8. Check that the status of the RabbitMQ cluster is okay.

    Duration 10m
    """
    rabbitmq_resource = 'master_p_rabbitmq-server'

    def expect_cluster_status(expected):
        # The fourth argument is left as None on purpose (presumably the
        # hostname: the alarm is checked cluster-wide, not per node).
        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
                          None, expected)

    def ban_on_node(target, expected, wait=None):
        # Ban the resource over SSH first; the alarm is checked only once
        # the SSH session has been released.
        with self.fuel_web.get_ssh_for_node(target.name) as ssh:
            logger.info("Ban rabbitmq resource on {}".format(target.name))
            self.remote_ops.ban_resource(
                ssh, rabbitmq_resource, wait=wait)
        expect_cluster_status(expected)

    self.env.revert_snapshot("deploy_ha_toolchain")

    # Baseline: the cluster must report okay before anything is banned.
    expect_cluster_status(OKAY_STATUS)

    nailgun_controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
        self.helpers.cluster_id, ["controller"])
    first_controller = self.fuel_web.get_devops_node_by_nailgun_node(
        nailgun_controllers[0])
    entry_node_name = first_controller.name

    rabbitmq_master = self.fuel_web.get_rabbit_master_node(entry_node_name)
    rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(entry_node_name)

    ban_on_node(rabbitmq_slaves[0], WARNING_STATUS, 120)
    ban_on_node(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
    # Don't wait for the pcs operation to complete as it will fail since
    # the resource isn't running anywhere
    ban_on_node(rabbitmq_master, DOWN_STATUS)

    logger.info("Clear rabbitmq resource")
    with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as ssh:
        self.remote_ops.clear_resource(
            ssh, rabbitmq_resource, wait=240)
    expect_cluster_status(OKAY_STATUS)