Add four toolchain functional tests

Add four toolchain functional tests and toolchain helper methods.

Change-Id: I0ea3613d8e8e44a2dfa9d5a9bea26c0e9b793ee4
Parent: 6b41a46fad
Commit: e4b4ef93de
@@ -15,6 +15,8 @@
 from contextlib import closing
 import socket

+from devops.error import DevopsCalledProcessError
+from devops.helpers import helpers as devops_helpers
 from proboscis import asserts
 import requests
 from requests.packages.urllib3 import poolmanager
@@ -83,3 +85,32 @@ def check_port(address, port):
     """
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
         return sock.connect_ex((address, port)) == 0
+
+
+def check_local_mail(remote, node_name, service, state, timeout=10 * 60):
+    """Check that an email from the LMA Infrastructure Alerting plugin
+    about a service changing its state is present on the host.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    :param node_name: name of the node to check for email on.
+    :type node_name: str
+    :param service: name of the service to look for.
+    :type service: str
+    :param state: service state to look for.
+    :type state: str
+    :param timeout: timeout to wait for email to arrive.
+    :type timeout: int
+    """
+    def check_mail():
+        try:
+            response = remote.check_call("cat $MAIL")
+            if not response:
+                return False
+            if ("Service: {}\n".format(service) in response['stdout'] and
+                    "State: {}\n".format(state) in response['stdout']):
+                return True
+        except DevopsCalledProcessError:
+            return False
+    msg = ("Email about service {0} in {1} state was not "
+           "found on {2} after {3} seconds").format(
+        service, state, node_name, timeout)
+    devops_helpers.wait(check_mail, timeout=timeout, timeout_msg=msg)
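A minimal standalone sketch of the matching logic inside check_mail() above; the mailbox text is invented for illustration and stands in for the stdout of `cat $MAIL` on the node:

    # Runnable without an SSH connection; only the string matching is shown.
    mailbox = ("From root@node-1  Mon Jan 11 10:00:00 2016\n"
               "Subject: PROBLEM Service Alert: node-1/nova is WARNING\n"
               "Service: nova\n"
               "State: WARNING\n")

    def mail_mentions(service, state, text):
        # check_mail() requires both markers to appear in the mailbox text.
        return ("Service: {}\n".format(service) in text and
                "State: {}\n".format(state) in text)

    assert mail_mentions("nova", "WARNING", mailbox)
    assert not mail_mentions("cinder", "CRITICAL", mailbox)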
@@ -576,7 +576,7 @@ class PluginHelper(object):
         for service in ha_services:
             remote_ops.manage_pacemaker_service(remote, service)
         for service in non_ha_services:
-            remote_ops.manage_initctl_service(remote, service)
+            remote_ops.manage_service(remote, service)

         logger.info("Restarting services on computes")
         compute_services = (
@@ -586,7 +586,7 @@ class PluginHelper(object):
         for compute in computes:
             with self.fuel_web.get_ssh_for_nailgun_node(compute) as remote:
                 for service in compute_services:
-                    remote_ops.manage_initctl_service(remote, service)
+                    remote_ops.manage_service(remote, service)

     @staticmethod
     def check_notifications(got_list, expected_list):
@@ -129,7 +129,7 @@ def manage_pacemaker_service(remote, name, operation="restart"):
         operation=operation, service=name))


-def manage_initctl_service(remote, name, operation="restart"):
+def manage_service(remote, name, operation="restart"):
    """Operate service on remote node.

    :param remote: SSH connection to the node.
@@ -139,8 +139,24 @@ def manage_initctl_service(remote, name, operation="restart"):
    :param operation: type of operation, usually start, stop or restart.
    :type operation: str
    """
-    remote.check_call("initctl {operation} {service}".format(
-        operation=operation, service=name))
+    if remote.execute("service {} status".format(name))['exit_code'] == 0:
+        service_cmd = 'service {service} {operation}'
+    elif remote.execute("initctl status {}".format(name))['exit_code'] == 0:
+        service_cmd = 'initctl {operation} {service}'
+    else:
+        raise Exception('no service handler!')
+
+    remote.check_call(service_cmd.format(service=name, operation=operation))
+
+
+def clear_local_mail(remote):
+    """Clean local mail.
+
+    :param remote: SSH connection to the node.
+    :type remote: SSHClient
+    """
+    remote.check_call("rm -f $MAIL")


def fill_up_filesystem(remote, fs, percent, file_name):
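A standalone sketch of the handler-detection fallback that manage_service() now performs: prefer the sysvinit `service` wrapper, fall back to upstart's `initctl`, fail loudly otherwise. The `run` callable and the exit-code table here are stand-ins for remote.execute(); they are not part of the change:

    def pick_service_cmd(run, name):
        # Mirrors the if/elif/else above; run() returns an exit code.
        if run("service {} status".format(name)) == 0:
            return "service {service} {operation}"
        if run("initctl status {}".format(name)) == 0:
            return "initctl {operation} {service}"
        raise Exception("no service handler!")

    # A host where only upstart knows the job: `service` fails, initctl works.
    exit_codes = {"service nova-api status": 1, "initctl status nova-api": 0}
    cmd = pick_service_cmd(lambda c: exit_codes[c], "nova-api")
    assert cmd.format(service="nova-api", operation="restart") == \
        "initctl restart nova-api"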
@@ -22,6 +22,10 @@ from stacklight_tests.influxdb_grafana.grafana_ui import api as ui_api
 from stacklight_tests.influxdb_grafana import plugin_settings


+class NotFound(Exception):
+    pass
+
+
 class InfluxdbPluginApi(base_test.PluginApi):
     def __init__(self):
         super(InfluxdbPluginApi, self).__init__()
@@ -179,3 +183,35 @@ class InfluxdbPluginApi(base_test.PluginApi):
         if result:
             return result["series"][0]["values"]
         return []
+
+    def check_cluster_status(self, name, expected_status, interval='3m'):
+        query = ("SELECT last(value) FROM cluster_status WHERE "
+                 "time > now() - {0} AND cluster_name='{1}'".format(interval,
+                                                                    name))
+        msg_header = "Wrong '{0}' service state has been found!".format(
+            name)
+        self._check_influx_query_last_value(query, expected_status,
+                                            msg_header)
+
+    def check_count_of_haproxy_backends(self, service, node_state='down',
+                                        expected_count=0, interval='3m'):
+
+        query = ("SELECT last(value) FROM haproxy_backend_servers WHERE "
+                 "backend='{0}' AND state='{1}' and "
+                 "time > now() - {2}".format(service, node_state, interval))
+
+        msg_header = ("Wrong amount of nodes with service '{0}' "
+                      "in '{1}' state!".format(service, node_state))
+        self._check_influx_query_last_value(query, expected_count, msg_header)
+
+    def _check_influx_query_last_value(self, query, expected_value,
+                                       msg_header):
+        output = self.do_influxdb_query(query)
+        lines = output.json()
+        if not lines['results'][0]:
+            logger.error("The query result is empty!")
+            raise NotFound
+        state = lines['results'][0]['series'][0]['values'][0][1]
+        asserts.assert_equal(expected_value, state,
+                             msg_header + " Expected {0} but"
+                             " found {1}".format(expected_value, state))
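A minimal sketch of the JSON shape _check_influx_query_last_value() walks, with a hand-written response standing in for do_influxdb_query(); the timestamp and value are made up for illustration:

    influx_response = {
        "results": [{
            "series": [{
                "name": "cluster_status",
                "columns": ["time", "last"],
                "values": [["2016-01-11T10:00:00Z", 1]],
            }],
        }],
    }

    # ['values'][0][1] selects the value column of the single row that
    # last() returns; 1 corresponds to WARN in the settings added below.
    state = influx_response['results'][0]['series'][0]['values'][0][1]
    assert state == 1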
@@ -11,10 +11,13 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
+import six.moves as sm

 from devops.helpers import helpers
 from fuelweb_test import logger
 from proboscis import asserts

+from selenium.common.exceptions import StaleElementReferenceException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
@@ -90,9 +93,7 @@ class InfraAlertingPluginApi(base_test.PluginApi):
         return "{0}://{1}:{2}".format(self.nagios_protocol,
                                       self.get_nagios_vip(), self.nagios_port)

-    def open_nagios_page(self, link_text, anchor):
-        driver = self.ui_tester.get_driver(self.get_authenticated_nagios_url(),
-                                           "//frame[2]", "Nagios Core")
+    def open_nagios_page(self, driver, link_text, anchor):
         driver.switch_to.default_content()
         driver.switch_to.frame(driver.find_element_by_name("side"))
         link = driver.find_element_by_link_text(link_text)
@@ -104,19 +105,19 @@ class InfraAlertingPluginApi(base_test.PluginApi):
         return driver

     def check_node_in_nagios(self, changed_node, state):
-        driver = self.open_nagios_page(
-            'Hosts', "//table[@class='headertable']")
-        try:
+        with self.ui_tester.ui_driver(
+                self.get_authenticated_nagios_url(),
+                "//frame[2]", "Nagios Core") as driver:
+            driver = self.open_nagios_page(
+                driver, 'Hosts', "//table[@class='headertable']")
             asserts.assert_equal(state, self.node_is_present(
-                driver, changed_node), "Failed to find node '{0}' on nagios!"
-                .format(changed_node))
-        finally:
-            driver.close()
+                driver, changed_node), "Failed to find node '{0}' "
+                "on nagios!".format(changed_node))

     def node_is_present(self, driver, name):
         table = self.ui_tester.get_table(driver,
                                          "/html/body/div[2]/table/tbody")
-        for ind in xrange(2, self.ui_tester.get_table_size(table) + 1):
+        for ind in sm.xrange(2, self.ui_tester.get_table_size(table) + 1):
             node_name = self.ui_tester.get_table_cell(
                 table, ind, 1).text.rstrip()
             if name == node_name:
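The xrange change above is a Python 3 compatibility fix. A quick sketch of what six.moves provides (requires the six package):

    import six.moves as sm

    # six.moves.xrange resolves to the builtin xrange on Python 2 and to
    # range on Python 3, so the loop works unchanged on both.
    assert list(sm.xrange(2, 5)) == [2, 3, 4]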
@@ -131,3 +132,71 @@ class InfraAlertingPluginApi(base_test.PluginApi):
     def check_uninstall_failure(self):
         return self.helpers.check_plugin_cannot_be_uninstalled(
             self.settings.name, self.settings.version)
+
+    def get_services_for_node(self, table, node_name, driver,
+                              table_xpath="/html/body/table[3]/tbody"):
+        services = {}
+        found_node = False
+        ind = 2
+        while ind < self.ui_tester.get_table_size(table) + 1:
+            try:
+                if not self.ui_tester.get_table_row(table, ind).text:
+                    if found_node:
+                        break
+                    else:
+                        # Skip separator rows before the target node is
+                        # found; increment here because continue bypasses
+                        # the ind += 1 at the bottom of the loop.
+                        ind += 1
+                        continue
+                if self.ui_tester.get_table_cell(
+                        table, ind, 1).text == node_name:
+                    found_node = True
+                if found_node:
+                    services[self.ui_tester.get_table_cell(
+                        table, ind, 2).text] = (
+                        self.ui_tester.get_table_cell(table, ind, 3).text)
+            except StaleElementReferenceException:
+                table = self.ui_tester.get_table(driver, table_xpath)
+                ind -= 1
+            ind += 1
+
+        return services
+
+    def check_service_state_on_nagios(self, driver, service_state=None,
+                                      node_names=None):
+        self.open_nagios_page(
+            driver, 'Services', "//table[@class='headertable']")
+        table = self.ui_tester.get_table(driver, "/html/body/table[3]/tbody")
+        if not node_names:
+            node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
+        for node in node_names:
+            node_services = self.get_services_for_node(table, node, driver)
+            if service_state:
+                for service in service_state:
+                    if service_state[service] != node_services[service]:
+                        return False
+            else:
+                for service in node_services:
+                    if 'OK' != node_services[service]:
+                        return False
+        return True
+
+    def wait_service_state_on_nagios(self, driver, service_state=None,
+                                     node_names=None):
+        msg = ("Failed to get expected service states for services: {0} "
+               "on nodes: {1}")
+
+        if not service_state or not node_names:
+            self.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            table = self.ui_tester.get_table(driver,
+                                             "/html/body/table[3]/tbody")
+            if not node_names:
+                node_names = [self.ui_tester.get_table_cell(table, 2, 1).text]
+            if not service_state:
+                service_state = dict((key, 'OK') for key in
+                                     self.get_services_for_node(
+                                         table, node_names[0], driver))
+
+        msg = msg.format([key for key in service_state], node_names)
+
+        helpers.wait(lambda: self.check_service_state_on_nagios(
+            driver, service_state, node_names), timeout=60 * 5,
+            timeout_msg=msg)
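A standalone sketch of the per-node comparison check_service_state_on_nagios() performs, with plain dicts standing in for the scraped Nagios services table; the service names and states are illustrative:

    def states_match(expected, actual):
        # With an explicit expectation, every listed service must match;
        # without one, every service on the node must be 'OK'.
        if expected:
            return all(actual.get(svc) == st for svc, st in expected.items())
        return all(st == 'OK' for st in actual.values())

    actual = {'nova': 'WARNING', 'cinder': 'OK'}
    assert states_match({'nova': 'WARNING'}, actual)
    assert not states_match(None, actual)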
@@ -45,8 +45,8 @@ class ToolchainApi(object):
         self.ELASTICSEARCH_KIBANA = elasticsearch_api.ElasticsearchPluginApi()
         self.INFLUXDB_GRAFANA = influx_api.InfluxdbPluginApi()
         self.LMA_COLLECTOR = collector_api.LMACollectorPluginApi()
-        self.LMA_INFRASTRUCTURE_ALERTING = \
-            infrastructure_alerting_api.InfraAlertingPluginApi()
+        self.LMA_INFRASTRUCTURE_ALERTING = (
+            infrastructure_alerting_api.InfraAlertingPluginApi())
         self._plugins = {
             self.ELASTICSEARCH_KIBANA,
             self.INFLUXDB_GRAFANA,
@@ -373,3 +373,132 @@ class ToolchainApi(object):
         msg = "Failed to set vm_memory_high_watermark to {}".format(limit)
         devops_helpers.wait(check_result, timeout=timeout,
                             interval=10, timeout_msg=msg)
+
+    def change_verify_service_state(self, service_name, action, new_state,
+                                    service_state_in_influx,
+                                    down_backends_in_haproxy, toolchain_node,
+                                    controller_nodes, nagios_driver):
+        """Verify that the alerts for services show up in the Grafana
+        and Nagios UI.
+
+        :param service_name: name of the service to change the state of.
+            Format [service name, service name on dashboard],
+            e.g. ['nova-api', 'nova'].
+        :type service_name: list
+        :param action: action to perform (e.g. stop, start).
+        :type action: str
+        :param new_state: new state of the service.
+        :type new_state: str
+        :param service_state_in_influx: new state of the service in InfluxDB.
+        :type service_state_in_influx: int
+        :param down_backends_in_haproxy: amount of backends in 'down' state.
+        :type down_backends_in_haproxy: int
+        :param toolchain_node: toolchain node with the
+            infrastructure_alerting_ui VIP.
+        :type toolchain_node: dict
+        :param controller_nodes: list of the controller nodes to change
+            the service state on.
+        :type controller_nodes: list
+        :param nagios_driver: Selenium web driver.
+        :type nagios_driver: WebDriver
+        """
+
+        logger.info("Changing state of service {0}. "
+                    "New state is {1}".format(service_name[0], new_state))
+        with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote:
+            self.remote_ops.clear_local_mail(remote)
+        for node in controller_nodes:
+            with self.helpers.fuel_web.get_ssh_for_nailgun_node(
+                    node) as remote:
+                self.remote_ops.manage_service(remote, service_name[0], action)
+        self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+            nagios_driver, {service_name[1]: new_state})
+        self.INFLUXDB_GRAFANA.check_cluster_status(
+            service_name[1], service_state_in_influx)
+        self.INFLUXDB_GRAFANA.check_count_of_haproxy_backends(
+            service_name[0], expected_count=down_backends_in_haproxy)
+        with self.helpers.fuel_web.get_ssh_for_nailgun_node(
+                toolchain_node) as remote:
+            self.checkers.check_local_mail(
+                remote, toolchain_node["name"], service_name[1], new_state)
+
+    def change_verify_node_service_state(self, services, state, influx_state,
+                                         percent, toolchain_node,
+                                         controller_nodes, nagios_driver):
+        """Verify that the alerts for nodes show up in the Grafana
+        and Nagios UI.
+
+        :param services: list of services to check the new status of.
+            Format ['mysql', 'mysql-nodes.mysql-fs'].
+        :type services: list
+        :param state: new state of the service.
+        :type state: str
+        :param influx_state: new InfluxDB state.
+        :type influx_state: int
+        :param percent: amount of space to be filled on a node.
+        :type percent: int
+        :param toolchain_node: toolchain node with the
+            infrastructure_alerting_ui VIP.
+        :type toolchain_node: dict
+        :param controller_nodes: list of the controller nodes to change
+            the service state on.
+        :type controller_nodes: list
+        :param nagios_driver: Selenium web driver.
+        :type nagios_driver: WebDriver
+        """
+
+        with self.fuel_web.get_ssh_for_nailgun_node(toolchain_node) as remote:
+            self.remote_ops.clear_local_mail(remote)
+
+        with self.fuel_web.get_ssh_for_nailgun_node(
+                controller_nodes[0]) as remote:
+            self.remote_ops.fill_up_filesystem(
+                remote, "/dev/mapper/mysql-root", percent,
+                "/var/lib/mysql/test/bigfile")
+
+        self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+            nagios_driver, {services[0]: 'OK'})
+        self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+            nagios_driver, {services[1]: state},
+            [controller_nodes[0]['hostname']])
+        self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
+                                                   self.settings.OKAY)
+
+        with self.fuel_web.get_ssh_for_nailgun_node(
+                controller_nodes[1]) as remote:
+            self.remote_ops.fill_up_filesystem(
+                remote, "/dev/mapper/mysql-root", percent,
+                "/var/lib/mysql/test/bigfile")
+
+        for node in controller_nodes:
+            self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+                nagios_driver, {services[0]: state})
+            self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+                nagios_driver, {services[1]: state}, [node['hostname']])
+        self.INFLUXDB_GRAFANA.check_cluster_status(services[0], influx_state)
+
+        with self.helpers.fuel_web.get_ssh_for_nailgun_node(
+                toolchain_node) as remote:
+            self.checkers.check_local_mail(
+                remote, toolchain_node["name"], services[0], state)
+
+        for node in controller_nodes:
+            with self.fuel_web.get_ssh_for_nailgun_node(node) as remote:
+                self.remote_ops.clean_filesystem(remote,
+                                                 "/var/lib/mysql/test/bigfile")
+
+        for node in controller_nodes:
+            self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+                nagios_driver, {services[0]: 'OK'})
+            self.LMA_INFRASTRUCTURE_ALERTING.wait_service_state_on_nagios(
+                nagios_driver, {services[1]: 'OK'}, [node['hostname']])
+        self.INFLUXDB_GRAFANA.check_cluster_status(services[0],
+                                                   self.settings.OKAY)
+
+        with self.helpers.fuel_web.get_ssh_for_nailgun_node(
+                toolchain_node) as remote:
+            self.checkers.check_local_mail(
+                remote, toolchain_node["name"], services[0], 'OK')
@@ -13,6 +13,7 @@
 # under the License.

 from fuelweb_test.helpers.decorators import log_snapshot_after_test
+from fuelweb_test import logger
 from proboscis import test

 from stacklight_tests.toolchain import api
@@ -234,3 +235,336 @@ class TestFunctionalToolchain(api.ToolchainApi):
         self.check_plugins_online()

         self.check_cinder_notifications()
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_warning_alert_service", "service_restart",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_warning_alert_service(self):
+        """Verify that the warning alerts for services show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Connect to one of the controller nodes using ssh and
+               stop the nova-api service.
+            2. Wait for at least 1 minute.
+            3. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'WARN' with an orange background,
+                - the API panels report 1 entity as down.
+            4. On Nagios, check the following items:
+                - the 'nova' service is in 'WARNING' state,
+                - the local user root on the lma node has received
+                  an email about the service being in warning state.
+            5. Restart the nova-api service.
+            6. Wait for at least 1 minute.
+            7. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - the API panels report 0 entities as down.
+            8. On Nagios, check the following items:
+                - the 'nova' service is in 'OK' state,
+                - the local user root on the lma node has received
+                  an email about the recovery of the service.
+            9. Repeat steps 2 to 8 for the following services:
+                - Nova (stopping and starting the nova-api and
+                  nova-scheduler services respectively).
+                - Cinder (stopping and starting the cinder-api and
+                  cinder-scheduler services respectively).
+                - Neutron (stopping and starting the neutron-server
+                  and neutron-openvswitch-agent services respectively).
+                - Glance (stopping and starting the glance-api service).
+                - Heat (stopping and starting the heat-api service).
+                - Keystone (stopping and starting the Apache service).
+
+        Duration 45m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        services = {
+            'nova': ['nova-api', 'nova-scheduler'],
+            'cinder': ['cinder-api', 'cinder-scheduler'],
+            'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
+            'glance': ['glance-api'],
+            'heat': ['heat-api'],
+            'keystone': ['apache2']
+        }
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name(
+                self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            controller_node = (
+                self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+                    self.helpers.cluster_id, ['controller'])[0])
+            for key in services:
+                for service in services[key]:
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='stop',
+                        new_state='WARNING',
+                        service_state_in_influx=self.settings.WARN,
+                        down_backends_in_haproxy=1,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_node],
+                        nagios_driver=driver)
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='start',
+                        new_state='OK',
+                        service_state_in_influx=self.settings.OKAY,
+                        down_backends_in_haproxy=0,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_node],
+                        nagios_driver=driver)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_critical_alert_service", "service_restart",
+                  "toolchain", "functional"])
+    # @log_snapshot_after_test
+    def toolchain_critical_alert_service(self):
+        """Verify that the critical alerts for services show up in
+        the Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL.
+            2. Connect to one of the controller nodes using ssh and
+               stop the nova-api service.
+            3. Connect to a second controller node using ssh and stop
+               the nova-api service.
+            4. Wait for at least 1 minute.
+            5. On Nagios, check the following items:
+                - the 'nova' service is in 'CRITICAL' state,
+                - the local user root on the lma node has received
+                  an email about the service being in critical state.
+            6. Restart the nova-api service on both nodes.
+            7. Wait for at least 1 minute.
+            8. On Nagios, check the following items:
+                - the 'nova' service is in 'OK' state,
+                - the local user root on the lma node has received
+                  an email about the recovery of the service.
+            9. Repeat steps 2 to 8 for the following services:
+                - Nova (stopping and starting the nova-api and
+                  nova-scheduler services respectively).
+                - Cinder (stopping and starting the cinder-api and
+                  cinder-scheduler services respectively).
+                - Neutron (stopping and starting the neutron-server
+                  and neutron-openvswitch-agent services respectively).
+                - Glance (stopping and starting the glance-api service).
+                - Heat (stopping and starting the heat-api service).
+                - Keystone (stopping and starting the Apache service).
+
+        Duration 45m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        services = {
+            'nova': ['nova-api', 'nova-scheduler'],
+            'cinder': ['cinder-api', 'cinder-scheduler'],
+            'neutron': ['neutron-server', 'neutron-openvswitch-agent'],
+            'glance': ['glance-api'],
+            'heat': ['heat-api'],
+            'keystone': ['apache2']
+        }
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name(
+                self.LMA_INFRASTRUCTURE_ALERTING.settings.failover_vip))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            controller_nodes = (
+                self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+                    self.helpers.cluster_id, ['controller']))
+            for key in services:
+                for service in services[key]:
+                    logger.info("Checking service {0}".format(service))
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='stop',
+                        new_state='CRITICAL',
+                        service_state_in_influx=self.settings.CRIT,
+                        down_backends_in_haproxy=2,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_nodes[0],
+                                          controller_nodes[1]],
+                        nagios_driver=driver)
+                    self.change_verify_service_state(
+                        service_name=[service, key], action='start',
+                        new_state='OK',
+                        service_state_in_influx=self.settings.OKAY,
+                        down_backends_in_haproxy=0,
+                        toolchain_node=toolchain_node,
+                        controller_nodes=[controller_nodes[0],
+                                          controller_nodes[1]],
+                        nagios_driver=driver)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_warning_alert_node", "node_alert_warning",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_warning_alert_node(self):
+        """Verify that the warning alerts for nodes show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL.
+            2. Open the Grafana URL.
+            3. Connect to one of the controller nodes using ssh and run:
+                   fallocate -l $(df | grep /dev/mapper/mysql-root
+                   | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
+                   / 100) - $3))}') /var/lib/mysql/test
+            4. Wait for at least 1 minute.
+            5. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background.
+            6. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'WARNING'
+                  state for the node.
+            7. Connect to a second controller node using ssh and run:
+                   fallocate -l $(df | grep /dev/mapper/mysql-root
+                   | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) * 96
+                   / 100) - $3))}') /var/lib/mysql/test
+            8. Wait for at least 1 minute.
+            9. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'WARN' with an orange background,
+                - an annotation stating that the service went from 'OKAY'
+                  to 'WARN' is displayed.
+            10. On Nagios, check the following items:
+                - the 'mysql' service is in 'WARNING' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'WARNING'
+                  state for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the service being in warning state.
+            11. Run the following command on both controller nodes:
+                   rm /var/lib/mysql/test
+            12. Wait for at least 1 minute.
+            13. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - an annotation stating that the service went from 'WARN'
+                  to 'OKAY' is displayed.
+            14. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'OK' state
+                  for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the recovery of the service.
+
+        Duration 15m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+        nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ['controller'])
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            self.change_verify_node_service_state(
+                ['mysql', 'mysql-nodes.mysql-fs'], 'WARNING',
+                self.settings.WARN, '96', toolchain_node,
+                [nailgun_nodes[0], nailgun_nodes[1]], driver)
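A sketch of the arithmetic inside the fallocate command from the scenario above: df reports 1K blocks with used space in $3 and available space in $4, so the command allocates the gap between current usage and the target percentage, converted to bytes. The sample numbers are made up for illustration:

    def bytes_to_reach(used_kb, avail_kb, target_percent):
        # total = used + available; allocate (total * target% - used) in KB,
        # then convert to bytes for fallocate -l.
        total_kb = used_kb + avail_kb
        return int(1024 * (total_kb * target_percent / 100.0 - used_kb))

    # e.g. 300000 KB used, 700000 KB available, fill to 96%:
    assert bytes_to_reach(300000, 700000, 96) == 675840000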
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["toolchain_critical_alert_node", "node_alert_critical",
+                  "toolchain", "functional"])
+    @log_snapshot_after_test
+    def toolchain_critical_alert_node(self):
+        """Verify that the critical alerts for nodes show up in the
+        Grafana and Nagios UI.
+
+        Scenario:
+            1. Open the Nagios URL.
+            2. Open the Grafana URL.
+            3. Connect to one of the controller nodes using ssh and run:
+                   fallocate -l $(df | grep /dev/mapper/mysql-root
+                   | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
+                   98 / 100) - $3))}') /var/lib/mysql/test
+            4. Wait for at least 1 minute.
+            5. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background.
+            6. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
+                  state for the node.
+            7. Connect to a second controller node using ssh and run:
+                   fallocate -l $(df | grep /dev/mapper/mysql-root
+                   | awk '{ printf("%.0f\n", 1024 * ((($3 + $4) *
+                   98 / 100) - $3))}') /var/lib/mysql/test
+            8. Wait for at least 1 minute.
+            9. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'CRIT' with an orange background,
+                - an annotation stating that the service went from 'OKAY'
+                  to 'CRIT' is displayed.
+            10. On Nagios, check the following items:
+                - the 'mysql' service is in 'CRITICAL' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'CRITICAL'
+                  state for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the service being in critical state.
+            11. Run the following command on both controller nodes:
+                   rm /var/lib/mysql/test
+            12. Wait for at least 1 minute.
+            13. On Grafana, check the following items:
+                - the box in the upper left corner of the dashboard
+                  displays 'OKAY' with a green background,
+                - an annotation stating that the service went from 'CRIT'
+                  to 'OKAY' is displayed.
+            14. On Nagios, check the following items:
+                - the 'mysql' service is in 'OK' state,
+                - the 'mysql-nodes.mysql-fs' service is in 'OK' state
+                  for the 2 nodes,
+                - the local user root on the lma node has received an
+                  email about the recovery of the service.
+
+        Duration 15m
+        """
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        lma_devops_node = self.helpers.get_node_with_vip(
+            self.settings.stacklight_roles,
+            self.helpers.full_vip_name("infrastructure_alerting_mgmt_vip"))
+        toolchain_node = self.fuel_web.get_nailgun_node_by_devops_node(
+            lma_devops_node)
+        nailgun_nodes = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ['controller'])
+
+        url = self.LMA_INFRASTRUCTURE_ALERTING.get_authenticated_nagios_url()
+        with self.ui_tester.ui_driver(url, "//frame[2]",
+                                      "Nagios Core") as driver:
+            self.LMA_INFRASTRUCTURE_ALERTING.open_nagios_page(
+                driver, 'Services', "//table[@class='headertable']")
+            self.change_verify_node_service_state(
+                ['mysql', 'mysql-nodes.mysql-fs'], 'CRITICAL',
+                self.settings.UNKW, '98', toolchain_node,
+                [nailgun_nodes[0], nailgun_nodes[1]], driver)
@@ -27,6 +27,12 @@ stacklight_roles = (elasticsearch_settings.role_name +
                     collector_settings.role_name +
                     infrastructure_alerting_settings.role_name)

+OKAY = 0
+WARN = 1
+UNKW = 2
+CRIT = 3
+DOWN = 4
+
 base_nodes = {
     'slave-01': ['controller'],
     'slave-02': ['compute', 'cinder'],
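For reference, a hypothetical lookup pairing the numeric cluster_status values above with the labels the tests look for on the Grafana dashboard; the helper itself is not part of the change:

    STATUS_NAMES = {0: 'OKAY', 1: 'WARN', 2: 'UNKW', 3: 'CRIT', 4: 'DOWN'}

    assert STATUS_NAMES[1] == 'WARN'  # e.g. nova-api stopped on one controller
    assert STATUS_NAMES[3] == 'CRIT'  # e.g. nova-api stopped on two controllers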