Merge "Add recheck to statsd so that false alarms don't happen"

This commit is contained in:
Jenkins 2013-05-13 18:07:53 +00:00 committed by Gerrit Code Review
commit 7b2a30f7f5
5 changed files with 24 additions and 2 deletions

View File

@ -67,4 +67,5 @@ datadog_api_key=0987654321
datadog_app_key=1234567890
datadog_message_tail="@user@domain.com"
datadog_tags=service:lbaas
datadog_env=prod
ping_interval = 60

View File

@ -98,6 +98,13 @@ class AdminAPI(object):
), body
)
def get_device(self, device_id):
    """Request a single device record from the admin API.

    Builds ``<self.url>/devices/<device_id>`` and passes it to
    ``self._get``, returning whatever that helper returns unchanged.

    :param device_id: identifier interpolated into the request URL.
    :returns: the result of ``self._get``.
        NOTE(review): the caller visible in this change unpacks it as a
        ``(status_code, body)`` pair -- confirm against ``_get``.
    """
    device_url = '{url}/devices/{device_id}'.format(
        device_id=device_id, url=self.url
    )
    return self._get(device_url)
def _get_node_list(self, limit, marker):
return self._get(
'{url}/devices?marker={marker}&limit={limit}'

View File

@ -22,7 +22,7 @@ class DatadogDriver(AlertDriver):
super(DatadogDriver, self).__init__(logger, args)
def send_alert(self, message, device_id):
title = 'Load balancer failure'
title = 'Load balancer failure in {0}'.format(self.args.datadog_env)
text = 'Load balancer failed with message {0} {1}'.format(
message, self.args.datadog_message_tail
)
@ -33,7 +33,7 @@ class DatadogDriver(AlertDriver):
self.logger.info('Datadog alert response: {0}'.format(resp))
def send_repair(self, message, device_id):
title = 'Load balancer recovered'
title = 'Load balancer recovered in {0}'.format(self.args.datadog_env)
text = 'Load balancer recovered with message {0} {1}'.format(
message, self.args.datadog_message_tail
)

View File

@ -82,6 +82,10 @@ def main():
'--datadog_tags',
help='A space separated list of tags for Datadog alerts'
)
options.parser.add_argument(
'--datadog_env', default='unknown',
help='Server environment'
)
args = options.run()

View File

@ -90,6 +90,9 @@ class Sched(object):
if api.is_online():
lb_list = api.get_ping_list()
pings = len(lb_list)
if pings == 0:
self.logger.info('No LBs to ping')
return (0, 0)
for lb in lb_list:
node_list.append(lb['name'])
gearman = GearJobs(self.logger, self.args)
@ -129,8 +132,15 @@ class Sched(object):
return tested, repaired
def _send_fails(self, failed_nodes, node_list):
api = AdminAPI(self.args.api_server, self.logger)
for node in failed_nodes:
data = self._get_node(node, node_list)
# device could have been marked offline between getting the list
# and testing, check if this is the case
status_code, device_status = api.get_device(data['id'])
if device_status['status'] != 'ONLINE':
continue
message = (
'Load balancer failed\n'
'ID: {0}\n'