Add four toolchain functional tests

Add four toolchain functional tests.
Add toolchain helper methods.

Change-Id: I0ea3613d8e8e44a2dfa9d5a9bea26c0e9b793ee4
This commit is contained in:
Vladimir Ushakov 2016-06-16 13:08:39 +03:00
parent 6b41a46fad
commit e4b4ef93de
8 changed files with 639 additions and 18 deletions

View File

@ -15,6 +15,8 @@
from contextlib import closing
import socket
from devops.error import DevopsCalledProcessError
from devops.helpers import helpers as devops_helpers
from proboscis import asserts
import requests
from requests.packages.urllib3 import poolmanager
@ -83,3 +85,32 @@ def check_port(address, port):
"""
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
return sock.connect_ex((address, port)) == 0
def check_local_mail(remote, node_name, service, state, timeout=10 * 60):
    """Wait until an email about a service state change is present on a host.

    The local mailbox (``$MAIL``) is read over ``remote`` repeatedly until it
    contains both a ``Service: <service>`` and a ``State: <state>`` line, as
    sent by the LMA Infrastructure Alerting plugin.

    :param remote: SSH connection to the node.
    :type remote: SSHClient
    :param node_name: name of the node to check for email on (only used in
        the timeout message).
    :type node_name: str
    :param service: service name expected in the "Service: ..." line.
    :type service: str
    :param state: service state expected in the "State: ..." line.
    :type state: str
    :param timeout: timeout to wait for email to arrive, in seconds.
    :type timeout: int
    """
    def check_mail():
        try:
            response = remote.check_call("cat $MAIL")
        except DevopsCalledProcessError:
            # Reading the mailbox failed; report "not there yet" so the
            # caller keeps waiting instead of aborting.
            return False
        if not response:
            return False
        stdout = response['stdout']
        return ("Service: {}\n".format(service) in stdout and
                "State: {}\n".format(state) in stdout)

    msg = ("Email about service {0} in {1} state was not "
           "found on {2} after {3} seconds").format(
               service, state, node_name, timeout)
    devops_helpers.wait(check_mail, timeout=timeout, timeout_msg=msg)

View File

@ -576,7 +576,7 @@ class PluginHelper(object):
for service in ha_services:
remote_ops.manage_pacemaker_service(remote, service)
for service in non_ha_services:
remote_ops.manage_initctl_service(remote, service)
remote_ops.manage_service(remote, service)
logger.info("Restarting services on computes")
compute_services = (
@ -586,7 +586,7 @@ class PluginHelper(object):
for compute in computes:
with self.fuel_web.get_ssh_for_nailgun_node(compute) as remote:
for service in compute_services:
remote_ops.manage_initctl_service(remote, service)
remote_ops.manage_service(remote, service)
@staticmethod
def check_notifications(got_list, expected_list):

View File

@ -129,7 +129,7 @@ def manage_pacemaker_service(remote, name, operation="restart"):
operation=operation, service=name))
def manage_initctl_service(remote, name, operation="restart"):
def manage_service(remote, name, operation="restart"):
"""Operate service on remote node.
:param remote: SSH connection to the node.
@ -139,8 +139,24 @@ def manage_initctl_service(remote, name, operation="restart"):
:param operation: type of operation, usually start, stop or restart.
:type operation: str
"""
remote.check_call("initctl {operation} {service}".format(
operation=operation, service=name))
if remote.execute("service {} status".format(name))['exit_code'] == 0:
service_cmd = 'service {service} {operation}'
elif remote.execute("initctl status {}".format(name))['exit_code'] == 0:
service_cmd = 'initctl {operation} {service}'
else:
raise Exception('no service handler!')
remote.check_call(service_cmd.format(service=name, operation=operation))
def clear_local_mail(remote):
    """Remove the local mailbox file (``$MAIL``) on the remote node.

    :param remote: SSH connection to the node.
    :type remote: SSHClient
    """
    cleanup_command = "rm -f $MAIL"
    remote.check_call(cleanup_command)
def fill_up_filesystem(remote, fs, percent, file_name):

View File

@ -22,6 +22,10 @@ from stacklight_tests.influxdb_grafana.grafana_ui import api as ui_api
from stacklight_tests.influxdb_grafana import plugin_settings
class NotFound(Exception):
    """Raised when an InfluxDB query returns no result to check."""
class InfluxdbPluginApi(base_test.PluginApi):
def __init__(self):
super(InfluxdbPluginApi, self).__init__()
@ -179,3 +183,35 @@ class InfluxdbPluginApi(base_test.PluginApi):
if result:
return result["series"][0]["values"]
return []
def check_cluster_status(self, name, expected_status, interval='3m'):
    """Check the last ``cluster_status`` value recorded for a cluster.

    :param name: value of the ``cluster_name`` tag to filter on.
    :type name: str
    :param expected_status: status value the last data point must equal.
    :param interval: how far back from now to look (InfluxDB duration).
    :type interval: str
    """
    query_template = ("SELECT last(value) FROM cluster_status WHERE "
                      "time > now() - {0} AND cluster_name='{1}'")
    failure_header = "Wrong '{0}' service state has been found!".format(name)
    self._check_influx_query_last_value(
        query_template.format(interval, name), expected_status,
        failure_header)
def check_count_of_haproxy_backends(self, service, node_state='down',
                                    expected_count=0, interval='3m'):
    """Check the last ``haproxy_backend_servers`` value for a backend.

    :param service: value of the ``backend`` tag to filter on.
    :type service: str
    :param node_state: value of the ``state`` tag to filter on.
    :type node_state: str
    :param expected_count: count the last data point must equal.
    :type expected_count: int
    :param interval: how far back from now to look (InfluxDB duration).
    :type interval: str
    """
    query = ("SELECT last(value) FROM haproxy_backend_servers WHERE "
             "backend='{0}' AND state='{1}' and "
             "time > now() - {2}".format(service, node_state, interval))
    # The failure message previously misspelled "amount".
    msg_header = ("Wrong amount of nodes with service '{0}' "
                  "in '{1}' state!".format(service, node_state))
    self._check_influx_query_last_value(query, expected_count, msg_header)
def _check_influx_query_last_value(self, query, expected_value,
                                   msg_header):
    """Run an InfluxDB query and assert on the value it returns.

    :param query: InfluxDB query to run.
    :type query: str
    :param expected_value: value the first returned row must hold.
    :param msg_header: text put in front of the assertion failure message.
    :type msg_header: str
    :raises NotFound: if the first result of the query holds no series.
    """
    response = self.do_influxdb_query(query).json()
    result = response['results'][0]
    # A result that is empty or carries no 'series' key has no data point;
    # report both cases as NotFound rather than failing with a KeyError.
    if not result or 'series' not in result:
        logger.error(
            "The query returned no data (empty ['results']): {0}".format(
                query))
        raise NotFound("No data returned by the query: {0}".format(query))
    # Second column of the first row; presumably rows are [time, value].
    state = result['series'][0]['values'][0][1]
    asserts.assert_equal(
        expected_value, state,
        "{0} Expected {1} but found {2}".format(
            msg_header, expected_value, state))

View File

@ -11,10 +11,13 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import six.moves as sm
from devops.helpers import helpers
from fuelweb_test import logger
from proboscis import asserts
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
@ -90,9 +93,7 @@ class InfraAlertingPluginApi(base_test.PluginApi):
return "{0}://{1}:{2}".format(self.nagios_protocol,
self.get_nagios_vip(), self.nagios_port)
def open_nagios_page(self, link_text, anchor):
driver = self.ui_tester.get_driver(self.get_authenticated_nagios_url(),
"//frame[2]", "Nagios Core")
def open_nagios_page(self, driver, link_text, anchor):
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element_by_name("side"))
link = driver.find_element_by_link_text(link_text)
@ -104,19 +105,19 @@ class InfraAlertingPluginApi(base_test.PluginApi):
return driver
def check_node_in_nagios(self, changed_node, state):
driver = self.open_nagios_page(
'Hosts', "//table[@class='headertable']")
try:
with self.ui_tester.ui_driver(
self.get_authenticated_nagios_url(),
"//frame[2]", "Nagios Core") as driver:
driver = self.open_nagios_page(
driver, 'Hosts', "//table[@class='headertable']")
asserts.assert_equal(state, self.node_is_present(
driver, changed_node), "Failed to find node '{0}' on nagios!"
.format(changed_node))
finally:
driver.close()
driver, changed_node), "Failed to find node '{0}' "
"on nagios!".format(changed_node))
def node_is_present(self, driver, name):
table = self.ui_tester.get_table(driver,
"/html/body/div[2]/table/tbody")
for ind in xrange(2, self.ui_tester.get_table_size(table) + 1):
for ind in sm.xrange(2, self.ui_tester.get_table_size(table) + 1):
node_name = self.ui_tester.get_table_cell(
table, ind, 1).text.rstrip()
if name == node_name:
@ -131,3 +132,71 @@ class InfraAlertingPluginApi(base_test.PluginApi):
def check_uninstall_failure(self):
    """Delegate to the helper that checks the plugin cannot be uninstalled.

    The plugin is identified by the name and version from ``self.settings``;
    the helper's result is returned unchanged.
    """
    plugin = self.settings
    return self.helpers.check_plugin_cannot_be_uninstalled(
        plugin.name, plugin.version)
def get_services_for_node(self, table, node_name, driver,
                          table_xpath="/html/body/table[3]/tbody"):
    """Collect the services listed for a node in the Nagios services table.

    Rows are scanned from row 2 on. Collection starts at the row whose first
    cell equals ``node_name`` and stops at the first row with empty text
    after it.

    :param table: table element to read the rows from.
    :param node_name: host name to look for in the first column.
    :type node_name: str
    :param driver: selenium web driver, used to re-fetch the table when the
        current element has gone stale.
    :type driver: WebDriver
    :param table_xpath: XPath used to re-fetch the table.
    :type table_xpath: str
    :returns: mapping of the text of column 2 to the text of column 3 for
        the rows of the node (empty if the node is not found).
    :rtype: dict
    """
    get_cell = self.ui_tester.get_table_cell
    services = {}
    found_node = False
    row = 2
    while row <= self.ui_tester.get_table_size(table):
        try:
            if not self.ui_tester.get_table_row(table, row).text:
                if found_node:
                    # Empty row after the node's rows: its section is over.
                    break
                # Empty row before the node was found: fall through so the
                # row index advances. A bare ``continue`` here skipped the
                # increment and re-read the same row forever.
            else:
                if get_cell(table, row, 1).text == node_name:
                    found_node = True
                if found_node:
                    services[get_cell(table, row, 2).text] = (
                        get_cell(table, row, 3).text)
        except StaleElementReferenceException:
            # The table element is no longer valid: fetch it again and
            # retry the same row.
            table = self.ui_tester.get_table(driver, table_xpath)
            continue
        row += 1
    return services
def check_service_state_on_nagios(self, driver, service_state=None,
                                  node_names=None):
    """Tell whether services are in the expected state on the Nagios page.

    :param driver: selenium web driver opened on the Nagios UI.
    :type driver: WebDriver
    :param service_state: mapping of service name to expected state; when
        empty, every service found for a node is expected to be 'OK'.
    :type service_state: dict
    :param node_names: nodes to check; when empty, the node named in the
        first data row of the services table is used.
    :type node_names: list
    :returns: True if all checked services are in the expected state on all
        checked nodes, False otherwise.
    :rtype: bool
    """
    self.open_nagios_page(
        driver, 'Services', "//table[@class='headertable']")
    table = self.ui_tester.get_table(driver, "/html/body/table[3]/tbody")
    if not node_names:
        node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
    for node in node_names:
        actual = self.get_services_for_node(table, node, driver)
        expected = service_state or dict.fromkeys(actual, 'OK')
        for service, state in expected.items():
            # A service that is not listed for the node is reported as "not
            # in the expected state" instead of raising KeyError, so that a
            # polling caller can retry.
            if service not in actual or actual[service] != state:
                return False
    return True
def wait_service_state_on_nagios(self, driver, service_state=None,
                                 node_names=None):
    """Wait until services reach the expected state on the Nagios page.

    Missing arguments are filled in from the services table before waiting:
    ``node_names`` defaults to the node named in the first data row, and
    ``service_state`` defaults to 'OK' for every service of the first node.

    :param driver: selenium web driver opened on the Nagios UI.
    :type driver: WebDriver
    :param service_state: mapping of service name to expected state.
    :type service_state: dict
    :param node_names: nodes to check.
    :type node_names: list
    """
    services_xpath = "/html/body/table[3]/tbody"
    if not (service_state and node_names):
        self.open_nagios_page(
            driver, 'Services', "//table[@class='headertable']")
        table = self.ui_tester.get_table(driver, services_xpath)
        if not node_names:
            first_host = self.ui_tester.get_table_cell(table, 2, 1).text
            node_names = [first_host]
        if not service_state:
            known_services = self.get_services_for_node(
                table, node_names[0], driver)
            service_state = dict.fromkeys(known_services, 'OK')

    timeout_text = ("Fail to get expected service states for services: {0} "
                    "on nodes: {1}").format(list(service_state), node_names)

    def states_match():
        return self.check_service_state_on_nagios(
            driver, service_state, node_names)

    helpers.wait(states_match, timeout=60 * 5, timeout_msg=timeout_text)

View File

@ -45,8 +45,8 @@ class ToolchainApi(object):
self.ELASTICSEARCH_KIBANA = elasticsearch_api.ElasticsearchPluginApi()
self.INFLUXDB_GRAFANA = influx_api.InfluxdbPluginApi()
self.LMA_COLLECTOR = collector_api.LMACollectorPluginApi()
self.LMA_INFRASTRUCTURE_ALERTING = \
infrastructure_alerting_api.InfraAlertingPluginApi()
self.LMA_INFRASTRUCTURE_ALERTING = (
infrastructure_alerting_api.InfraAlertingPluginApi())
self._plugins = {
self.ELASTICSEARCH_KIBANA,
self.INFLUXDB_GRAFANA,
@ -373,3 +373,132 @@ class ToolchainApi(object):
msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
devops_helpers.wait(check_result, timeout=timeout,
interval=10, timeout_msg=msg)
def change_verify_service_state(self, service_name, action, new_state,
service_state_in_influx,
down_backends_in_haproxy, toolchain_node,
controller_nodes, nagios_driver):
"""Change the state of a service and verify the resulting alerts.
The new state is checked in the Nagios UI, in InfluxDB (cluster status
and count of HAProxy backends) and in the local mail of the toolchain node.
:param service_name: name of the service to change state of.
Format [service name, service name
on dashboard] e.g. ['nova-api', 'nova']
:type service_name: list.
:param action: action to perform (e.g. stop, start).
:type action: str
:param new_state: new state of the service.
:type new_state: str
:param service_state_in_influx: new state of the service in influx.
:type service_state_in_influx: int
:param down_backends_in_haproxy: amount of backends in 'down' state.
:type down_backends_in_haproxy: int
:param toolchain_node: toolchain node with
infrastructure_alerting_ui vip.
:type toolchain_node: dict
:param controller_nodes: list of the controller nodes to change
service state on.
:type controller_nodes: list
:param nagios_driver: selenium web driver used to read the Nagios UI.
:type nagios_driver: WebDriver
"""
logger.info("Changing state of service {0}. "
"New state is {1}".format(service_name[0], new_state))
# Empty the mailbox first so that the mail check below can only match an
# email received after this state change.
with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote:
self.remote_ops.clear_local_mail(remote)
# service_name[0] is the service that is actually managed on the nodes.
for node in controller_nodes:
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
node) as remote:
self.remote_ops.manage_service(remote, service_name[0], action)
# service_name[1] is the name used for the Nagios, cluster status and
# mail checks.
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {service_name[1]: new_state})
self.INFLUXDB_GRAFANA.check_cluster_status(
service_name[1], service_state_in_influx)
self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends(
service_name[0], expected_count=down_backends_in_haproxy)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
self.checkers.check_local_mail(
remote, toolchain_node["name"], service_name[1], new_state)
def change_verify_node_service_state(self, services, state, influx_state,
percent, toolchain_node,
controller_nodes, nagios_driver):
"""Fill up the MySQL filesystem on two controllers and verify the alerts.
The states are checked in the Nagios UI, in InfluxDB (cluster status) and
in the local mail of the toolchain node, first after filling the
filesystem up and then again after the filesystem has been cleaned.
:param services: list of services to check new status of. Format
['mysql', 'mysql-nodes.mysql-fs']
:type services: list
:param state: new state of the service.
:type state: str
:param influx_state: new influx state.
:type influx_state: int
:param percent: amount of space to be filled on a node.
:type percent: int
:param toolchain_node: toolchain node with
infrastructure_alerting_ui vip.
:type toolchain_node: dict
:param controller_nodes: list of the controller nodes to change
service state on.
:type controller_nodes: list
:param nagios_driver: selenium web driver used to read the Nagios UI.
:type nagios_driver: WebDriver
"""
# Empty the mailbox first so that the mail checks below can only match
# emails received during this scenario.
with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote:
self.remote_ops.clear_local_mail(remote)
# First controller only: services[0] is expected to stay 'OK' while
# services[1] changes state for that node.
with self.fuel_web.get_ssh_for_nailgun_node(
controller_nodes[0]) as remote:
self.remote_ops.fill_up_filesystem(
remote, "/dev/mapper/mysql-root", percent,
"/var/lib/mysql/test/bigfile")
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: 'OK'})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: state},
[controller_nodes[0]['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
self.settings.OKAY)
# Second controller: now services[0] is expected to change state too.
with self.fuel_web.get_ssh_for_nailgun_node(
controller_nodes[1]) as remote:
self.remote_ops.fill_up_filesystem(
remote, "/dev/mapper/mysql-root", percent,
"/var/lib/mysql/test/bigfile")
for node in controller_nodes:
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: state})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: state}, [node['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0], influx_state)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
self.checkers.check_local_mail(
remote, toolchain_node["name"], services[0], state)
# Recovery: remove the file on every controller and expect 'OK' everywhere.
for node in controller_nodes:
with self.fuel_web.get_ssh_for_nailgun_node(node) as remote:
self.remote_ops.clean_filesystem(remote,
"/var/lib/mysql/test/bigfile")
for node in controller_nodes:
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[0]: 'OK'})
self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
nagios_driver, {services[1]: 'OK'}, [node['hostname']])
self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
self.settings.OKAY)
with self.helpers.fuel_web.get_ssh_for_nailgun_node(
toolchain_node) as remote:
self.checkers.check_local_mail(
remote, toolchain_node["name"], services[0], 'OK')

View File

@ -13,6 +13,7 @@
# under the License.
from fuelweb_test.helpers.decorators import log_snapshot_after_test
from fuelweb_test import logger
from proboscis import test
from stacklight_tests.toolchain import api
@ -234,3 +235,336 @@ class TestFunctionalToolchain(api.ToolchainApi):
self.check_plugins_online()
self.check_cinder_notifications()
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_warning_alert_service", "service_restart",
"toolchain", "functional"])
@log_snapshot_after_test
def toolchain_warning_alert_service(self):
"""Verify that the warning alerts for services show up in the
Grafana and Nagios UI.
Scenario:
1. Connect to one of the controller nodes using ssh and
stop the nova-api service.
2. Wait for at least 1 minute.
3. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'WARN' with an orange background,
- the API panels report 1 entity as down.
4. On Nagios, check the following items:
- the 'nova' service is in 'WARNING' state,
- the local user root on the lma node has received
an email about the service
being in warning state.
5. Restart the nova-api service.
6. Wait for at least 1 minute.
7. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'OKAY' with a green background,
- the API panels report 0 entity as down.
8. On Nagios, check the following items:
- the 'nova' service is in 'OK' state,
- the local user root on the lma node has received
an email about the recovery
of the service.
9. Repeat steps 2 to 8 for the following services:
- Nova (stopping and starting the nova-api and
nova-scheduler)
- Cinder (stopping and starting the cinder-api and
cinder-scheduler services respectively).
- Neutron (stopping and starting the neutron-server
and neutron-openvswitch-agent services respectively).
- Glance (stopping and starting the glance-api service).
- Heat (stopping and starting the heat-api service).
- Keystone (stopping and starting the Apache service).
Duration 45m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
# Name checked in Nagios/InfluxDB -> services stopped and started on the
# controller.
services = {
'nova': ['nova-api', 'nova-scheduler'],
'cinder': ['cinder-api', 'cinder-scheduler'],
'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
'glance': ['glance-api'],
'heat': ['heat-api'],
'keystone': ['apache2']
}
# The node holding the alerting VIP is where the alert emails are checked.
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name(
self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "//frame[2]",
"Nagios Core") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
# Only one controller is touched, hence one backend expected down.
controller_node = (
self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])[0])
for key in services:
for service in services[key]:
self.change_verify_service_state(
service_name=[service, key], action='stop',
new_state='WARNING',
service_state_in_influx=self.settings.WARN,
down_backends_in_haproxy=1,
toolchain_node=toolchain_node,
controller_nodes=[controller_node],
nagios_driver=driver)
self.change_verify_service_state(
service_name=[service, key], action='start',
new_state='OK',
service_state_in_influx=self.settings.OKAY,
down_backends_in_haproxy=0,
toolchain_node=toolchain_node,
controller_nodes=[controller_node],
nagios_driver=driver)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_critical_alert_service", "service_restart",
"toolchain", "functional"])
# NOTE(review): the decorator below is commented out here, unlike in the
# sibling tests - confirm whether this is intentional.
# @log_snapshot_after_test
def toolchain_critical_alert_service(self):
"""Verify that the critical alerts for services show up in
the Grafana and Nagios UI.
Scenario:
1. Open the Nagios URL
2. Connect to one of the controller nodes using ssh and
stop the nova-api service.
3. Connect to a second controller node using ssh and stop
the nova-api service.
4. Wait for at least 1 minute.
5. On Nagios, check the following items:
- the 'nova' service is in 'CRITICAL' state,
- the local user root on the lma node has received
an email about the service
being in critical state.
6. Restart the nova-api service on both nodes.
7. Wait for at least 1 minute.
8. On Nagios, check the following items:
- the 'nova' service is in 'OK' state,
- the local user root on the lma node has received
an email about the recovery
of the service.
9. Repeat steps 2 to 8 for the following services:
- Nova (stopping and starting the nova-api and
nova-scheduler)
- Cinder (stopping and starting the cinder-api and
cinder-scheduler services respectively).
- Neutron (stopping and starting the neutron-server
and neutron-openvswitch-agent services respectively).
- Glance (stopping and starting the glance-api service).
- Heat (stopping and starting the heat-api service).
- Keystone (stopping and starting the Apache service).
Duration 45m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
# Name checked in Nagios/InfluxDB -> services stopped and started on the
# controllers.
services = {
'nova': ['nova-api', 'nova-scheduler'],
'cinder': ['cinder-api', 'cinder-scheduler'],
'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
'glance': ['glance-api'],
'heat': ['heat-api'],
'keystone': ['apache2']
}
# The node holding the alerting VIP is where the alert emails are checked.
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name(
self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "//frame[2]",
"Nagios Core") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
controller_nodes = (
self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller']))
# Two controllers are touched, hence two backends expected down.
for key in services:
for service in services[key]:
logger.info("Checking service {0}".format(service))
self.change_verify_service_state(
service_name=[service, key], action='stop',
new_state='CRITICAL',
service_state_in_influx=self.settings.CRIT,
down_backends_in_haproxy=2,
toolchain_node=toolchain_node,
controller_nodes=[controller_nodes[0],
controller_nodes[1]],
nagios_driver=driver)
self.change_verify_service_state(
service_name=[service, key], action='start',
new_state='OK',
service_state_in_influx=self.settings.OKAY,
down_backends_in_haproxy=0,
toolchain_node=toolchain_node,
controller_nodes=[controller_nodes[0],
controller_nodes[1]],
nagios_driver=driver)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_warning_alert_node", "node_alert_warning",
"toolchain", "functional"])
@log_snapshot_after_test
def toolchain_warning_alert_node(self):
"""Verify that the warning alerts for nodes show up in the
Grafana and Nagios UI.
Scenario:
1. Open the Nagios URL
2. Open the Grafana URL
3. Connect to one of the controller nodes using ssh and
run:
fallocate -l $(df | grep /dev/mapper/mysql-root
| awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
/ 100) - $3))}') /var/lib/mysql/test
4. Wait for at least 1 minute.
5. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'OKAY' with a green background,
6. On Nagios, check the following items:
- the 'mysql' service is in 'OK' state,
- the 'mysql-nodes.mysql-fs' service is in 'WARNING'
state for the node.
7. Connect to a second controller node using ssh and run:
fallocate -l $(df | grep /dev/mapper/mysql-root
| awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
/ 100) - $3))}') /var/lib/mysql/test
8. Wait for at least 1 minute.
9. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'WARN' with an orange background,
- an annotation telling that the service went from 'OKAY'
to 'WARN' is displayed.
10. On Nagios, check the following items:
- the 'mysql' service is in 'WARNING' state,
- the 'mysql-nodes.mysql-fs' service is in 'WARNING'
state for the 2 nodes,
- the local user root on the lma node has received an
email about the service
being in warning state.
11. Run the following command on both controller nodes:
rm /var/lib/mysql/test
12. Wait for at least 1 minute.
13. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'OKAY' with a green background,
- an annotation telling that the service went from 'WARN'
to 'OKAY' is displayed.
14. On Nagios, check the following items:
- the 'mysql' service is in 'OK' state,
- the 'mysql-nodes.mysql-fs' service is in 'OK' state
for the 2 nodes,
- the local user root on the lma node has received an
email about the recovery of the service.
Duration 15m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
# The node holding the alerting VIP is where the alert emails are checked.
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "//frame[2]",
"Nagios Core") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
# Fill the filesystem up to 96% on the first two controllers.
self.change_verify_node_service_state(
['mysql', 'mysql-nodes.mysql-fs'], 'WARNING',
self.settings.WARN, '96', toolchain_node,
[nailgun_nodes[0], nailgun_nodes[1]], driver)
@test(depends_on_groups=["deploy_ha_toolchain"],
groups=["toolchain_critical_alert_node", "node_alert_critical",
"toolchain", "functional"])
@log_snapshot_after_test
def toolchain_critical_alert_node(self):
"""Verify that the critical alerts for nodes show up in the
Grafana and Nagios UI.
Scenario:
1. Open the Nagios URL
2. Open the Grafana URL
3. Connect to one of the controller nodes using ssh and run:
fallocate -l $(df | grep /dev/mapper/mysql-root
| awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
98 / 100) - $3))}') /var/lib/mysql/test
4. Wait for at least 1 minute.
5. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'OKAY' with a green background,
6. On Nagios, check the following items:
- the 'mysql' service is in 'OK' state,
- the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
state for the node.
7. Connect to a second controller node using ssh and run:
fallocate -l $(df | grep /dev/mapper/mysql-root
| awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
98 / 100) - $3))}') /var/lib/mysql/test
8. Wait for at least 1 minute.
9. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'CRIT' with an orange background,
- an annotation telling that the service went from 'OKAY'
to 'CRIT' is displayed.
10. On Nagios, check the following items:
- the 'mysql' service is in 'CRITICAL' state,
- the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
state for the 2 nodes,
- the local user root on the lma node has received an
email about the service
being in critical state.
11. Run the following command on both controller nodes:
rm /var/lib/mysql/test
12. Wait for at least 1 minute.
13. On Grafana, check the following items:
- the box in the upper left corner of the dashboard
displays 'OKAY' with a green background,
- an annotation telling that the service went from 'CRIT'
to 'OKAY' is displayed.
14. On Nagios, check the following items:
- the 'mysql' service is in 'OK' state,
- the 'mysql-nodes.mysql-fs' service is in 'OK' state
for the 2 nodes,
- the local user root on the lma node has received an
email about the recovery of the service.
Duration 15m
"""
self.env.revert_snapshot("deploy_ha_toolchain")
# The node holding the alerting VIP is where the alert emails are checked.
lma_devops_node = self.helpers.get_node_with_vip(
self.settings.stacklight_roles,
self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
lma_devops_node)
nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
self.helpers.cluster_id, ['controller'])
url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
with self.ui_tester.ui_driver(url, "//frame[2]",
"Nagios Core") as driver:
self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
driver, 'Services', "//table[@class='headertable']")
# Fill the filesystem up to 98% on the first two controllers.
# NOTE(review): the expected InfluxDB status is UNKW although the scenario
# describes a CRITICAL state (the critical service test uses CRIT) -
# confirm this is intentional.
self.change_verify_node_service_state(
['mysql', 'mysql-nodes.mysql-fs'], 'CRITICAL',
self.settings.UNKW, '98', toolchain_node,
[nailgun_nodes[0], nailgun_nodes[1]], driver)

View File

@ -27,6 +27,12 @@ stacklight_roles = (elasticsearch_settings.role_name +
collector_settings.role_name +
infrastructure_alerting_settings.role_name)
# Status values that the toolchain functional tests pass as the expected
# InfluxDB ``cluster_status`` value (e.g. ``self.settings.WARN``).
OKAY = 0
WARN = 1
UNKW = 2
CRIT = 3
DOWN = 4
base_nodes = {
'slave-01': ['controller'],
'slave-02': ['compute', 'cinder'],