From e4b4ef93de67e8a682ec2d551e273a782c8548e2 Mon Sep 17 00:00:00 2001
From: Vladimir Ushakov
Date: Thu, 16 Jun 2016 13:08:39 +0300
Subject: [PATCH] Add four toolchain functional tests

Add four toolchain functional tests.
Add toolchain helper methods.

Change-Id: I0ea3613d8e8e44a2dfa9d5a9bea26c0e9b793ee4
---
 stacklight_tests/helpers/checkers.py          |  31 ++
 stacklight_tests/helpers/helpers.py           |   4 +-
 stacklight_tests/helpers/remote_ops.py        |  22 +-
 stacklight_tests/influxdb_grafana/api.py      |  36 ++
 .../lma_infrastructure_alerting/api.py        |  91 ++++-
 stacklight_tests/toolchain/api.py             | 133 ++++++-
 stacklight_tests/toolchain/test_functional.py | 334 ++++++++++++++++++
 .../toolchain/toolchain_settings.py           |   6 +
 8 files changed, 639 insertions(+), 18 deletions(-)

diff --git a/stacklight_tests/helpers/checkers.py b/stacklight_tests/helpers/checkers.py
index efa36d6..b11c4d1 100644
--- a/stacklight_tests/helpers/checkers.py
+++ b/stacklight_tests/helpers/checkers.py
@@ -15,6 +15,8 @@
 from contextlib import closing
 import socket
 
+from devops.error import DevopsCalledProcessError
+from devops.helpers import helpers as devops_helpers
 from proboscis import asserts
 import requests
 from requests.packages.urllib3 import poolmanager
@@ -83,3 +85,32 @@ def check_port(address, port):
     """
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
         return sock.connect_ex((address, port)) == 0
+
+
+def check_local_mail(remote, node_name, service, state, timeout=10 * 60):
+    """Check that an email from the LMA Infrastructure Alerting plugin
+    about a service changing its state is present on the host.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param node_name: name of the node to check for email on.
+    :type node_name: str
+    :param service: name of the service to look for in the email.
+    :type service: str
+    :param state: expected service state reported in the email.
+    :type state: str
+    :param timeout: timeout to wait for email to arrive.
+    :type timeout: int
+    """
+    def check_mail():
+        try:
+            response = remote.check_call("cat $MAIL")
+            if not response:
+                return False
+            if ("Service: {}\n".format(service) in response['stdout'] and
+                    "State: {}\n".format(state) in response['stdout']):
+                return True
+        except DevopsCalledProcessError:
+            return False
+    msg = ("Email about service {0} in {1} state was not "
+           "found on {2} after {3} seconds").format(
+        service, state, node_name, timeout)
+    devops_helpers.wait(check_mail, timeout=timeout, timeout_msg=msg)
diff --git a/stacklight_tests/helpers/helpers.py b/stacklight_tests/helpers/helpers.py
index c8395fe..7777965 100644
--- a/stacklight_tests/helpers/helpers.py
+++ b/stacklight_tests/helpers/helpers.py
@@ -576,7 +576,7 @@ class PluginHelper(object):
             for service in ha_services:
                 remote_ops.manage_pacemaker_service(remote, service)
             for service in non_ha_services:
-                remote_ops.manage_initctl_service(remote, service)
+                remote_ops.manage_service(remote, service)
 
         logger.info("Restarting services on computes")
         compute_services = (
@@ -586,7 +586,7 @@
         for compute in computes:
             with self.fuel_web.get_ssh_for_nailgun_node(compute) as remote:
                 for service in compute_services:
-                    remote_ops.manage_initctl_service(remote, service)
+                    remote_ops.manage_service(remote, service)
 
     @staticmethod
     def check_notifications(got_list, expected_list):
diff --git a/stacklight_tests/helpers/remote_ops.py b/stacklight_tests/helpers/remote_ops.py
index a866329..90edadd 100644
--- a/stacklight_tests/helpers/remote_ops.py
+++ b/stacklight_tests/helpers/remote_ops.py
@@ -129,7 +129,7 @@ def manage_pacemaker_service(remote, name, operation="restart"):
             operation=operation, service=name))
 
 
-def manage_initctl_service(remote, name, operation="restart"):
+def manage_service(remote, name, operation="restart"):
     """Operate service on remote node.
 
     :param remote: SSH connection to the node.
@@ -139,8 +139,24 @@ def manage_initctl_service(remote, name, operation="restart"):
     :param operation: type of operation, usually start, stop or restart.
     :type operation: str
     """
-    remote.check_call("initctl {operation} {service}".format(
-        operation=operation, service=name))
+
+    if remote.execute("service {} status".format(name))['exit_code'] == 0:
+        service_cmd = 'service {service} {operation}'
+    elif remote.execute("initctl status {}".format(name))['exit_code'] == 0:
+        service_cmd = 'initctl {operation} {service}'
+    else:
+        raise Exception(
+            "No service manager found for service {}!".format(name))
+
+    remote.check_call(service_cmd.format(service=name, operation=operation))
+
+
+def clear_local_mail(remote):
+    """Clean up local mail on the node.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    """
+    remote.check_call("rm -f $MAIL")
 
 
 def fill_up_filesystem(remote, fs, percent, file_name):
diff --git a/stacklight_tests/influxdb_grafana/api.py b/stacklight_tests/influxdb_grafana/api.py
index bb1eb8c..91f35b4 100644
--- a/stacklight_tests/influxdb_grafana/api.py
+++ b/stacklight_tests/influxdb_grafana/api.py
@@ -22,6 +22,10 @@ from stacklight_tests.influxdb_grafana.grafana_ui import api as ui_api
 from stacklight_tests.influxdb_grafana import plugin_settings
 
 
+class NotFound(Exception):
+    pass
+
+
 class InfluxdbPluginApi(base_test.PluginApi):
     def __init__(self):
         super(InfluxdbPluginApi, self).__init__()
@@ -179,3 +183,35 @@ class InfluxdbPluginApi(base_test.PluginApi):
         if result:
             return result["series"][0]["values"]
         return []
+
+    def check_cluster_status(self, name, expected_status, interval='3m'):
+        query = ("SELECT last(value) FROM cluster_status WHERE "
+                 "time > now() - {0} AND cluster_name='{1}'".format(
+                     interval, name))
+        msg_header = "Wrong '{0}' service state has been found!".format(name)
+        self._check_influx_query_last_value(query, expected_status,
+                                            msg_header)
+
+    def check_count_of_haproxy_backends(self, service, node_state='down',
+                                        expected_count=0, interval='3m'):
+
+        query = ("SELECT last(value) FROM haproxy_backend_servers WHERE "
+                 "backend='{0}' AND state='{1}' and "
+                 "time > now() - {2}".format(service, node_state, interval))
+
+        msg_header = ("Wrong amount of nodes with service '{0}' "
+                      "in '{1}' state!".format(service, node_state))
+        self._check_influx_query_last_value(query, expected_count, msg_header)
+
+    def _check_influx_query_last_value(self, query, expected_value,
+                                       msg_header):
+        output = self.do_influxdb_query(query)
+        lines = output.json()
+        if not lines['results'][0]:
+            logger.error("The InfluxDB query returned no results!")
+            raise NotFound("No data found for query: {}".format(query))
+        state = lines['results'][0]['series'][0]['values'][0][1]
+        asserts.assert_equal(expected_value, state,
+                             msg_header + " Expected {0} but"
+                             " found {1}".format(expected_value, state))
diff --git a/stacklight_tests/lma_infrastructure_alerting/api.py b/stacklight_tests/lma_infrastructure_alerting/api.py
index 5206c86..6279538 100644
--- a/stacklight_tests/lma_infrastructure_alerting/api.py
+++ b/stacklight_tests/lma_infrastructure_alerting/api.py
@@ -11,10 +11,13 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import six.moves as sm
+from devops.helpers import helpers
 from fuelweb_test import logger
 from proboscis import asserts
+from selenium.common.exceptions import StaleElementReferenceException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
@@ -90,9 +93,7 @@ class InfraAlertingPluginApi(base_test.PluginApi):
         return "{0}://{1}:{2}".format(self.nagios_protocol,
                                       self.get_nagios_vip(),
                                       self.nagios_port)
 
-    def open_nagios_page(self, link_text, anchor):
-        driver = self.ui_tester.get_driver(self.get_authenticated_nagios_url(),
-                                           "//frame[2]", "Nagios Core")
+    def open_nagios_page(self, driver, link_text, anchor):
         driver.switch_to.default_content()
         driver.switch_to.frame(driver.find_element_by_name("side"))
         link = driver.find_element_by_link_text(link_text)
@@ -104,19 +105,19 @@ class InfraAlertingPluginApi(base_test.PluginApi):
         return driver
 
     def check_node_in_nagios(self, changed_node, state):
-        driver = self.open_nagios_page(
-            'Hosts', "//table[@class='headertable']")
-        try:
+        with self.ui_tester.ui_driver(
+                self.get_authenticated_nagios_url(),
+                "//frame[2]", "Nagios Core") as driver:
+            driver = self.open_nagios_page(
+                driver, 'Hosts', "//table[@class='headertable']")
             asserts.assert_equal(state, self.node_is_present(
-                driver, changed_node), "Failed to find node '{0}' on nagios!"
-                .format(changed_node))
-        finally:
-            driver.close()
+                driver, changed_node), "Failed to find node '{0}' "
+                "on nagios!".format(changed_node))
 
     def node_is_present(self, driver, name):
         table = self.ui_tester.get_table(driver,
                                          "/html/body/div[2]/table/tbody")
-        for ind in xrange(2, self.ui_tester.get_table_size(table) + 1):
+        for ind in sm.xrange(2, self.ui_tester.get_table_size(table) + 1):
             node_name = self.ui_tester.get_table_cell(
                 table, ind, 1).text.rstrip()
             if name == node_name:
@@ -131,3 +132,71 @@ class InfraAlertingPluginApi(base_test.PluginApi):
     def check_uninstall_failure(self):
         return self.helpers.check_plugin_cannot_be_uninstalled(
             self.settings.name, self.settings.version)
+
+    def get_services_for_node(self, table, node_name, driver,
+                              table_xpath="/html/body/table[3]/tbody"):
+        services = {}
+        found_node = False
+        ind = 2
+        while ind < self.ui_tester.get_table_size(table) + 1:
+            try:
+                if not self.ui_tester.get_table_row(table, ind).text:
+                    if found_node:
+                        break
+                    # Skip empty separator rows until the node is found,
+                    # otherwise the loop would never advance.
+                    ind += 1
+                    continue
+                if self.ui_tester.get_table_cell(
+                        table, ind, 1).text == node_name:
+                    found_node = True
+                if found_node:
+                    services[self.ui_tester.get_table_cell(
+                        table, ind, 2).text] = (
+                        self.ui_tester.get_table_cell(table, ind, 3).text)
+            except StaleElementReferenceException:
+                table = self.ui_tester.get_table(driver, table_xpath)
+                ind -= 1
+            ind += 1
+
+        return services
+
+    def check_service_state_on_nagios(self, driver, service_state=None,
+                                      node_names=None):
+        self.open_nagios_page(
+            driver, 'Services', "//table[@class='headertable']")
+        table = self.ui_tester.get_table(driver, "/html/body/table[3]/tbody")
+        if not node_names:
+            node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
+        for node in node_names:
+            node_services = self.get_services_for_node(table, node, driver)
+            if service_state:
+                for service in service_state:
+                    if service_state[service] != node_services[service]:
+                        return False
+            else:
+                for service in node_services:
+                    if 'OK' != node_services[service]:
+                        return False
+        return True
+
+    def wait_service_state_on_nagios(self, driver, service_state=None,
+                                     node_names=None):
+        msg = ("Failed to get expected service states for services: {0} "
+               "on nodes: {1}")
+
+        if not service_state or not node_names:
+            self.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            table = self.ui_tester.get_table(driver,
+                                             "/html/body/table[3]/tbody")
+            if not node_names:
+                node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
+            if not service_state:
+                service_state = dict((key, 'OK') for key in
+                                     self.get_services_for_node(
+                                         table, node_names[0], driver))
+
+        msg = msg.format([key for key in service_state], node_names)
+
+        helpers.wait(lambda: self.check_service_state_on_nagios(
+            driver, service_state, node_names), timeout=60 * 5,
+            timeout_msg=msg)
diff --git a/stacklight_tests/toolchain/api.py b/stacklight_tests/toolchain/api.py
index fd73e7c..2349f63 100644
--- a/stacklight_tests/toolchain/api.py
+++ b/stacklight_tests/toolchain/api.py
@@ -45,8 +45,8 @@ class ToolchainApi(object):
         self.ELASTICSEARCH_KIBANA = elasticsearch_api.ElasticsearchPluginApi()
         self.INFLUXDB_GRAFANA = influx_api.InfluxdbPluginApi()
         self.LMA_COLLECTOR = collector_api.LMACollectorPluginApi()
-        self.LMA_INFRASTRUCTURE_ALERTING = \
-            infrastructure_alerting_api.InfraAlertingPluginApi()
+        self.LMA_INFRASTRUCTURE_ALERTING = (
+            infrastructure_alerting_api.InfraAlertingPluginApi())
         self._plugins = {
             self.ELASTICSEARCH_KIBANA,
             self.INFLUXDB_GRAFANA,
@@ -373,3 +373,132 @@ class ToolchainApi(object):
         msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
         devops_helpers.wait(check_result, timeout=timeout, interval=10,
                             timeout_msg=msg)
+
+    def change_verify_service_state(self, service_name, action, new_state,
+                                    service_state_in_influx,
+                                    down_backends_in_haproxy, toolchain_node,
+                                    controller_nodes, nagios_driver):
+        """Verify that the alerts for services show up in the Grafana
+        and Nagios UI.
+
+        :param service_name: pair of [service name, service name on the
+            dashboard], e.g. ['nova-api', 'nova'].
+        :type service_name: list
+        :param action: action to perform (e.g. stop, start).
+        :type action: str
+        :param new_state: new state of the service.
+        :type new_state: str
+        :param service_state_in_influx: new state of the service in InfluxDB.
+        :type service_state_in_influx: int
+        :param down_backends_in_haproxy: amount of backends in 'down' state.
+        :type down_backends_in_haproxy: int
+        :param toolchain_node: toolchain node with
+            infrastructure_alerting_ui vip.
+        :type toolchain_node: dict
+        :param controller_nodes: list of the controller nodes to change
+            service state on.
+        :type controller_nodes: list
+        :param nagios_driver: Selenium WebDriver instance pointing at the
+            Nagios UI.
+        :type nagios_driver: WebDriver
+        """
+
+        logger.info("Changing state of service {0}. "
" + "New state is {1}".format(service_name[0], new_state)) + with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote: + self.remote_ops.clear_local_mail(remote) + for node in controller_nodes: + with self.helpers.fuel_web.get_ssh_for_nailgun_node( + node) as remote: + self.remote_ops.manage_service(remote, service_name[0], action) + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {service_name[1]: new_state}) + self.INFLUXDB_GRAFANA.check_cluster_status( + service_name[1], service_state_in_influx) + self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends( + service_name[0], expected_count=down_backends_in_haproxy) + with self.helpers.fuel_web.get_ssh_for_nailgun_node( + toolchain_node) as remote: + self.checkers.check_local_mail( + remote, toolchain_node["name"], service_name[1], new_state) + + def change_verify_node_service_state(self, services, state, influx_state, + percent, toolchain_node, + controller_nodes, nagios_driver): + """Verify that the alerts for nodes show up in the Grafana + and Nagios UI. + + :param services: list of services to check new status of. Format + ['mysql', 'mysql-nodes.mysql-fs'] + :type services: list + :param state: new state of the service. + :type state: str + :param influx_state: new influx state. + :type influx_state: int + :param percent: amount of space to be filled on a node. + :type percent: int + :param toolchain_node: toolchain node with + infrastructure_alerting_ui vip. + :type toolchain_node: dict + :param controller_nodes: list of the controller nodes to change + service state on. + :type controller_nodes: list + :param nagios_driver: selenium web driver + service state on. + :type nagios_driver: WebDriver + + """ + + with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote: + self.remote_ops.clear_local_mail(remote) + + with self.fuel_web.get_ssh_for_nailgun_node( + controller_nodes[0]) as remote: + self.remote_ops.fill_up_filesystem( + remote, "/dev/mapper/mysql-root", percent, + "/var/lib/mysql/test/bigfile") + + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[0]: 'OK'}) + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[1]: state}, + [controller_nodes[0]['hostname']]) + self.INFLUXDB_GRAFANA.check_cluster_status(services[0], + self.settings.OKAY) + + with self.fuel_web.get_ssh_for_nailgun_node( + controller_nodes[1]) as remote: + self.remote_ops.fill_up_filesystem( + remote, "/dev/mapper/mysql-root", percent, + "/var/lib/mysql/test/bigfile") + + for node in controller_nodes: + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[0]: state}) + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[1]: state}, [node['hostname']]) + self.INFLUXDB_GRAFANA.check_cluster_status(services[0], influx_state) + + with self.helpers.fuel_web.get_ssh_for_nailgun_node( + toolchain_node) as remote: + self.checkers.check_local_mail( + remote, toolchain_node["name"], services[0], state) + + for node in controller_nodes: + with self.fuel_web.get_ssh_for_nailgun_node(node) as remote: + self.remote_ops.clean_filesystem(remote, + "/var/lib/mysql/test/bigfile") + + for node in controller_nodes: + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[0]: 'OK'}) + self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios( + nagios_driver, {services[1]: 'OK'}, [node['hostname']]) + 
+        self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
+                                                   self.settings.OKAY)
+
+        with self.helpers.fuel_web.get_ssh_for_nailgun_node(
+                toolchain_node) as remote:
+            self.checkers.check_local_mail(
+                remote, toolchain_node["name"], services[0], 'OK')
diff --git a/stacklight_tests/toolchain/test_functional.py b/stacklight_tests/toolchain/test_functional.py
index a64f62c..125aa2f 100644
--- a/stacklight_tests/toolchain/test_functional.py
+++ b/stacklight_tests/toolchain/test_functional.py
@@ -13,6 +13,7 @@
 # under the License.
 
 from fuelweb_test.helpers.decorators import log_snapshot_after_test
+from fuelweb_test import logger
 from proboscis import test
 
 from stacklight_tests.toolchain import api
@@ -234,3 +235,336 @@ class TestFunctionalToolchain(api.ToolchainApi):
 
         self.check_plugins_online()
         self.check_cinder_notifications()
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_warning_alert_service", "service_restart",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_warning_alert_service(self):
+        """Verify that the warning alerts for services show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Connect to one of the controller nodes using ssh and
+               stop the nova-api service.
+            2. Wait for at least 1 minute.
+            3. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'WARN' with an orange background,
+                - the API panels report 1 entity as down.
+            4. On Nagios, check the following items:
+                - the 'nova' service is in 'WARNING' state,
+                - the local user root on the lma node has received
+                  an email about the service being in warning state.
+            5. Restart the nova-api service.
+            6. Wait for at least 1 minute.
+            7. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - the API panels report 0 entities as down.
+            8. On Nagios, check the following items:
+                - the 'nova' service is in 'OK' state,
+                - the local user root on the lma node has received
+                  an email about the recovery of the service.
+            9. Repeat steps 2 to 8 for the following services:
+                - Nova (stopping and starting the nova-api and
+                  nova-scheduler services respectively).
+                - Cinder (stopping and starting the cinder-api and
+                  cinder-scheduler services respectively).
+                - Neutron (stopping and starting the neutron-server
+                  and neutron-openvswitch-agent services respectively).
+                - Glance (stopping and starting the glance-api service).
+                - Heat (stopping and starting the heat-api service).
+                - Keystone (stopping and starting the Apache service).
+
+        Duration 45m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        services = {
+            'nova': ['nova-api', 'nova-scheduler'],
+            'cinder': ['cinder-api', 'cinder-scheduler'],
+            'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
+            'glance': ['glance-api'],
+            'heat': ['heat-api'],
+            'keystone': ['apache2']
+        }
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name(
+                self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            controller_node = (
+                self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+                    self.helpers.cluster_id, ['controller'])[0])
+            for key in services:
+                for service in services[key]:
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='stop',
+                        new_state='WARNING',
+                        service_state_in_influx=self.settings.WARN,
+                        down_backends_in_haproxy=1,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_node],
+                        nagios_driver=driver)
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='start',
+                        new_state='OK',
+                        service_state_in_influx=self.settings.OKAY,
+                        down_backends_in_haproxy=0,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_node],
+                        nagios_driver=driver)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_critical_alert_service", "service_restart",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_critical_alert_service(self):
+        """Verify that the critical alerts for services show up in
+        the Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL
+            2. Connect to one of the controller nodes using ssh and
+               stop the nova-api service.
+            3. Connect to a second controller node using ssh and stop
+               the nova-api service.
+            4. Wait for at least 1 minute.
+            5. On Nagios, check the following items:
+                - the 'nova' service is in 'CRITICAL' state,
+                - the local user root on the lma node has received
+                  an email about the service being in critical state.
+            6. Restart the nova-api service on both nodes.
+            7. Wait for at least 1 minute.
+            8. On Nagios, check the following items:
+                - the 'nova' service is in 'OK' state,
+                - the local user root on the lma node has received
+                  an email about the recovery of the service.
+            9. Repeat steps 2 to 8 for the following services:
+                - Nova (stopping and starting the nova-api and
+                  nova-scheduler services respectively).
+                - Cinder (stopping and starting the cinder-api and
+                  cinder-scheduler services respectively).
+                - Neutron (stopping and starting the neutron-server
+                  and neutron-openvswitch-agent services respectively).
+                - Glance (stopping and starting the glance-api service).
+                - Heat (stopping and starting the heat-api service).
+                - Keystone (stopping and starting the Apache service).
+
+        Duration 45m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        services = {
+            'nova': ['nova-api', 'nova-scheduler'],
+            'cinder': ['cinder-api', 'cinder-scheduler'],
+            'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
+            'glance': ['glance-api'],
+            'heat': ['heat-api'],
+            'keystone': ['apache2']
+        }
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name(
+                self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            controller_nodes = (
+                self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+                    self.helpers.cluster_id, ['controller']))
+            for key in services:
+                for service in services[key]:
+                    logger.info("Checking service {0}".format(service))
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='stop',
+                        new_state='CRITICAL',
+                        service_state_in_influx=self.settings.CRIT,
+                        down_backends_in_haproxy=2,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_nodes[0],
+                                          controller_nodes[1]],
+                        nagios_driver=driver)
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='start',
+                        new_state='OK',
+                        service_state_in_influx=self.settings.OKAY,
+                        down_backends_in_haproxy=0,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_nodes[0],
+                                          controller_nodes[1]],
+                        nagios_driver=driver)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_warning_alert_node", "node_alert_warning",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_warning_alert_node(self):
+        """Verify that the warning alerts for nodes show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL
+            2. Open the Grafana URL
+            3. Connect to one of the controller nodes using ssh and
+               run:
+               fallocate -l $(df | grep /dev/mapper/mysql-root
+               | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
+               / 100) - $3))}') /var/lib/mysql/test
+            4. Wait for at least 1 minute.
+            5. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+            6. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'WARNING'
+                  state for the node.
+            7. Connect to a second controller node using ssh and run:
+               fallocate -l $(df | grep /dev/mapper/mysql-root
+               | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
+               / 100) - $3))}') /var/lib/mysql/test
+            8. Wait for at least 1 minute.
+            9. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'WARN' with an orange background,
+                - an annotation telling that the service went from 'OKAY'
+                  to 'WARN' is displayed.
+            10. On Nagios, check the following items:
+                - the 'mysql' service is in 'WARNING' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'WARNING'
+                  state for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the service being in warning state.
+            11. Run the following command on both controller nodes:
+                rm /var/lib/mysql/test
+            12. Wait for at least 1 minute.
+            13. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - an annotation telling that the service went from 'WARN'
+                  to 'OKAY' is displayed.
+            14. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'OK' state
+                  for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the recovery of the service.
+
+        Duration 15m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+        nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ['controller'])
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            self.change_verify_node_service_state(
+                ['mysql', 'mysql-nodes.mysql-fs'], 'WARNING',
+                self.settings.WARN, '96', toolchain_node,
+                [nailgun_nodes[0], nailgun_nodes[1]], driver)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_critical_alert_node", "node_alert_critical",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_critical_alert_node(self):
+        """Verify that the critical alerts for nodes show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL
+            2. Open the Grafana URL
+            3. Connect to one of the controller nodes using ssh and run:
+               fallocate -l $(df | grep /dev/mapper/mysql-root
+               | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
+               98 / 100) - $3))}') /var/lib/mysql/test
+            4. Wait for at least 1 minute.
+            5. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+            6. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
+                  state for the node.
+            7. Connect to a second controller node using ssh and run:
+               fallocate -l $(df | grep /dev/mapper/mysql-root
+               | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
+               98 / 100) - $3))}') /var/lib/mysql/test
+            8. Wait for at least 1 minute.
+            9. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'CRIT' with an orange background,
+                - an annotation telling that the service went from 'OKAY'
+                  to 'CRIT' is displayed.
+            10. On Nagios, check the following items:
+                - the 'mysql' service is in 'CRITICAL' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
+                  state for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the service being in critical state.
+            11. Run the following command on both controller nodes:
+                rm /var/lib/mysql/test
+            12. Wait for at least 1 minute.
+            13. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - an annotation telling that the service went from 'CRIT'
+                  to 'OKAY' is displayed.
+            14. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'OK' state
+                  for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the recovery of the service.
+
+        Duration 15m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+        nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ['controller'])
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            self.change_verify_node_service_state(
+                ['mysql', 'mysql-nodes.mysql-fs'], 'CRITICAL',
+                self.settings.UNKW, '98', toolchain_node,
+                [nailgun_nodes[0], nailgun_nodes[1]], driver)
diff --git a/stacklight_tests/toolchain/toolchain_settings.py b/stacklight_tests/toolchain/toolchain_settings.py
index 2d77b5a..c7ee014 100644
--- a/stacklight_tests/toolchain/toolchain_settings.py
+++ b/stacklight_tests/toolchain/toolchain_settings.py
@@ -27,6 +27,12 @@
 stacklight_roles = (elasticsearch_settings.role_name +
                     collector_settings.role_name +
                     infrastructure_alerting_settings.role_name)
 
+OKAY = 0
+WARN = 1
+UNKW = 2
+CRIT = 3
+DOWN = 4
+
 base_nodes = {
     'slave-01': ['controller'],
     'slave-02': ['compute', 'cinder'],