diff --git a/stacklight_tests/helpers/remote_ops.py b/stacklight_tests/helpers/remote_ops.py
index fb50960..a866329 100644
--- a/stacklight_tests/helpers/remote_ops.py
+++ b/stacklight_tests/helpers/remote_ops.py
@@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
     return result['stdout'][0].strip().split()
 
 
+def ban_resource(remote, resource, wait=None):
+    """Ban a resource from the current node.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param resource: resource name.
+    :type resource: str
+    :param wait: number of seconds to wait for the operation to complete.
+    :type wait: int
+    """
+    cmd = "pcs resource ban {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
+def clear_resource(remote, resource, wait=None):
+    """Clear a resource.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param resource: resource name.
+    :type resource: str
+    :param wait: number of seconds to wait for the operation to complete.
+    :type wait: int
+    """
+    cmd = "pcs resource clear {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
 def manage_pacemaker_service(remote, name, operation="restart"):
     """Operate HA service on remote node.
 
diff --git a/stacklight_tests/toolchain/api.py b/stacklight_tests/toolchain/api.py
index 1dc7f5e..fd73e7c 100644
--- a/stacklight_tests/toolchain/api.py
+++ b/stacklight_tests/toolchain/api.py
@@ -329,21 +329,28 @@ class ToolchainApi(object):
         filter_by = "node_role"
         if alarm_type == "service":
             filter_by = "service"
-        query = (
-            "select last(value) from {select_from} where time >= {time}"
-            " and source = '{source}' and {filter} and hostname = '{hostname}'"
-            " and value = {value}".format(
-                select_from="{}_status".format(alarm_type), time=time_interval,
-                source=source, hostname=hostname, value=value,
-                filter="{} = '{}'".format(filter_by, filter_value)))
+        filters = [
+            "time >= {}".format(time_interval),
+            "source = '{}'".format(source),
+            "{} = '{}'".format(filter_by, filter_value),
+            "value = {}".format(value)
+        ]
+        if hostname is not None:
+            filters.append("hostname = '{}'".format(hostname))
+
+        query = "select last(value) from {select_from} where {filters}".format(
+            select_from="{}_status".format(alarm_type),
+            filters=" and ".join(filters))
+        logger.info("InfluxDB query: {}".format(query))
 
         def check_result():
             result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                 query=query).json()["results"][0]
             return len(result)
 
-        msg = ("Alarm with source {} and {} {} and value {} was"
-               " not triggered".format(source, filter_by, filter_value, value))
+        msg = ("Alarm of type {}: entity: {}, source: {}, hostname: {}, "
+               "value: {} wasn't triggered".format(alarm_type, filter_value,
+                                                   source, hostname, value))
         devops_helpers.wait(check_result, timeout=60 * 5, interval=10,
                             timeout_msg=msg)
 
diff --git a/stacklight_tests/toolchain/test_alarms.py b/stacklight_tests/toolchain/test_alarms.py
index c6e4600..345446a 100644
--- a/stacklight_tests/toolchain/test_alarms.py
+++ b/stacklight_tests/toolchain/test_alarms.py
@@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api
 
 OKAY_STATUS = 0
 WARNING_STATUS = 1
+UNKNOWN_STATUS = 2
 CRITICAL_STATUS = 3
+DOWN_STATUS = 4
+
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
+
 RABBITMQ_DISK_WARNING_PERCENT = 99.99
 RABBITMQ_DISK_CRITICAL_PERCENT = 100
 RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
                                               RABBITMQ_MEMORY_WARNING_VALUE)
         self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
                                             RABBITMQ_MEMORY_CRITICAL_VALUE)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_pacemaker_alarms(self):
+        """Check that rabbitmq-pacemaker-* alarms work as expected.
+
+        Scenario:
+            1. Stop one slave RabbitMQ instance.
+            2. Check that the status of the RabbitMQ cluster is warning.
+            3. Stop the second slave RabbitMQ instance.
+            4. Check that the status of the RabbitMQ cluster is critical.
+            5. Stop the master RabbitMQ instance.
+            6. Check that the status of the RabbitMQ cluster is down.
+            7. Clear the RabbitMQ resource.
+            8. Check that the status of the RabbitMQ cluster is okay.
+
+        Duration 10m
+        """
+        def ban_and_check_status(node, status, wait=None):
+            with self.fuel_web.get_ssh_for_node(node.name) as remote:
+                logger.info("Ban rabbitmq resource on {}".format(node.name))
+                self.remote_ops.ban_resource(remote,
+                                             'master_p_rabbitmq-server',
+                                             wait=wait)
+            self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                              None, status)
+
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
+
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+
+        controller = controllers[0]
+        controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
+            controller)
+        rabbitmq_master = self.fuel_web.get_rabbit_master_node(
+            controller_node.name)
+        rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
+            controller_node.name)
+        ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
+        ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
+        # Don't wait for the pcs operation to complete as it will fail since
+        # the resource isn't running anywhere
+        ban_and_check_status(rabbitmq_master, DOWN_STATUS)
+
+        logger.info("Clear rabbitmq resource")
+        with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
+            self.remote_ops.clear_resource(remote,
+                                           'master_p_rabbitmq-server',
+                                           wait=240)
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
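
Reviewer note: a minimal standalone sketch (not part of the patch) that mirrors the new filter-list query construction in check_alarms, to show the query string the hostname-less calls above end up sending to InfluxDB. The time_interval default here is illustrative only; the real value is computed elsewhere in api.py and does not appear in this diff.

def build_query(alarm_type, filter_value, source, hostname, value,
                time_interval="now() - 5m"):  # illustrative interval only
    # Same branching as the patched check_alarms body.
    filter_by = "node_role"
    if alarm_type == "service":
        filter_by = "service"
    filters = [
        "time >= {}".format(time_interval),
        "source = '{}'".format(source),
        "{} = '{}'".format(filter_by, filter_value),
        "value = {}".format(value),
    ]
    # hostname is optional now; it is only added when provided.
    if hostname is not None:
        filters.append("hostname = '{}'".format(hostname))
    return "select last(value) from {}_status where {}".format(
        alarm_type, " and ".join(filters))

# Example: the query used by check_rabbitmq_pacemaker_alarms (hostname=None)
print(build_query('service', 'rabbitmq-cluster', 'pacemaker', None, 0))
# -> select last(value) from service_status where time >= now() - 5m and
#    source = 'pacemaker' and service = 'rabbitmq-cluster' and value = 0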