Merge "Add test for RabbitMQ cluster alarms"

2016-09-02 13:57:38 +00:00 · 2016-09-02 13:57:38 +00:00 · 6b41a46fad
commit 6b41a46fad
parent b0d0ba18f8 2fd575a53f
3 changed files with 108 additions and 9 deletions
--- a/stacklight_tests/helpers/remote_ops.py
+++ b/stacklight_tests/helpers/remote_ops.py
@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
    return result['stdout'][0].strip().split()


+def ban_resource(remote, resource, wait=None):
+    """Ban a resource from the current node.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: seconds to wait for completion; None omits --wait.
+        :type wait: int
+    """
+    cmd = "pcs resource ban {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
+def clear_resource(remote, resource, wait=None):
+    """Clear a resource.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: seconds to wait for completion; None omits --wait.
+        :type wait: int
+    """
+    cmd = "pcs resource clear {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
 def manage_pacemaker_service(remote, name, operation="restart"):
    """Operate HA service on remote node.

--- a/stacklight_tests/toolchain/api.py
+++ b/stacklight_tests/toolchain/api.py
@ -329,21 +329,28 @@ class ToolchainApi(object):
        filter_by = "node_role"
        if alarm_type == "service":
            filter_by = "service"
-        query = (
-            "select last(value) from {select_from} where time >= {time}"
-            " and source = '{source}' and {filter} and hostname = '{hostname}'"
-            " and value = {value}".format(
-                select_from="{}_status".format(alarm_type), time=time_interval,
-                source=source, hostname=hostname, value=value,
-                filter="{} = '{}'".format(filter_by, filter_value)))
+        filters = [
+            "time >= {}".format(time_interval),
+            "source = '{}'".format(source),
+            "{} = '{}'".format(filter_by, filter_value),
+            "value = {}".format(value)
+        ]
+        if hostname is not None:
+            filters.append("hostname = '{}'".format(hostname))
+
+        query = "select last(value) from {select_from} where {filters}".format(
+                select_from="{}_status".format(alarm_type),
+                filters=" and ".join(filters))
+        logger.info("InfluxDB query: {}".format(query))

        def check_result():
            result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                query=query).json()["results"][0]
            return len(result)

-        msg = ("Alarm with source {} and {} {} and value {} was"
-               " not triggered".format(source, filter_by, filter_value, value))
+        msg = ("Alarm of type: {}: entity: {}, source:{}, hostname: {}, "
+               "value: {} wasn't triggered".format(alarm_type, filter_value,
+                                                   source, hostname, value))
        devops_helpers.wait(check_result, timeout=60 * 5,
                            interval=10, timeout_msg=msg)

--- a/stacklight_tests/toolchain/test_alarms.py
+++ b/stacklight_tests/toolchain/test_alarms.py
@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api

 OKAY_STATUS = 0
 WARNING_STATUS = 1
+UNKNOWN_STATUS = 2
 CRITICAL_STATUS = 3
+DOWN_STATUS = 4
+
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
+
 RABBITMQ_DISK_WARNING_PERCENT = 99.99
 RABBITMQ_DISK_CRITICAL_PERCENT = 100
 RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
                                            RABBITMQ_MEMORY_WARNING_VALUE)
        self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
                                            RABBITMQ_MEMORY_CRITICAL_VALUE)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_pacemaker_alarms(self):
+        """Check that rabbitmq-pacemaker-* alarms work as expected.
+
+        Scenario:
+            1. Stop one slave RabbitMQ instance.
+            2. Check that the status of the RabbitMQ cluster is warning.
+            3. Stop the second slave RabbitMQ instance.
+            4. Check that the status of the RabbitMQ cluster is critical.
+            5. Stop the master RabbitMQ instance.
+            6. Check that the status of the RabbitMQ cluster is down.
+            7. Clear the RabbitMQ resource.
+            8. Check that the status of the RabbitMQ cluster is okay.
+
+        Duration 10m
+        """
+        def ban_and_check_status(node, status, wait=None):
+            # Ban the RabbitMQ Pacemaker resource on `node` over SSH, then
+            # check that the rabbitmq-cluster alarm reports `status`.
+            with self.fuel_web.get_ssh_for_node(node.name) as remote:
+                logger.info("Ban rabbitmq resource on {}".format(node.name))
+                self.remote_ops.ban_resource(remote,
+                                             'master_p_rabbitmq-server',
+                                             wait=wait)
+            # NOTE(review): the None argument is presumably the hostname
+            # (no per-host filter for a cluster-wide alarm) -- confirm
+            # against the check_alarms signature.
+            self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                              None, status)
+
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        # The alarm is expected to be okay before any resource is banned.
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
+
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+
+        # The first controller is used only as the entry point for looking
+        # up the RabbitMQ master and slave nodes.
+        controller = controllers[0]
+        controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
+            controller)
+        rabbitmq_master = self.fuel_web.get_rabbit_master_node(
+            controller_node.name)
+        rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
+            controller_node.name)
+        ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
+        ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
+        # Don't wait for the pcs operation to complete as it will fail since
+        # the resource isn't running anywhere
+        ban_and_check_status(rabbitmq_master, DOWN_STATUS)
+
+        logger.info("Clear rabbitmq resource")
+        with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
+            self.remote_ops.clear_resource(remote,
+                                           'master_p_rabbitmq-server',
+                                           wait=240)
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)