diff --git a/monitoring/nagios/checks/check_gearman.py b/monitoring/nagios/checks/check_gearman.py
new file mode 100755
index 0000000..685ae07
--- /dev/null
+++ b/monitoring/nagios/checks/check_gearman.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+"""Nagios check: is a job registered with the local gearman server?"""
+
+import argparse
+import sys
+
+import utils
+
+
+def check_gearman_status(job_name):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical or 0 -> OK
+    There are no warnings with gearman job checker
+
+    The job counts as registered when job_name appears anywhere in the
+    reply to gearman's "status" admin command (plain substring match).
+    """
+    try:
+        # The short sleep presumably keeps the pipe open long enough for the
+        # reply to arrive; -w 1 is netcat's timeout in seconds.
+        gearadmin_status = utils.run_command_local(
+            '(echo status ; sleep 0.1) | netcat 127.0.0.1 4730 -w 1')
+        if job_name not in gearadmin_status:
+            return 2, ('Failed to find job registered with gearman!\n'
+                       'status:\n%s' % gearadmin_status)
+    except Exception as e:
+        # str(e) rather than e.message: the latter is deprecated and is
+        # empty for many exception types.
+        return 2, 'Failed to check gearman status: ' + str(e)
+
+    return 0, job_name + ' is registered with gearman'
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check gearman job status.')
+    parser.add_argument('--job', required=True, type=str,
+                        help='the job name to check for')
+    args = parser.parse_args()
+    code, message = check_gearman_status(args.job)
+    print(message)
+    # sys.exit instead of the site-provided exit(), which is meant for
+    # interactive use and is absent under "python -S".
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/check_jenkins.py b/monitoring/nagios/checks/check_jenkins.py
new file mode 100755
index 0000000..c49042e
--- /dev/null
+++ b/monitoring/nagios/checks/check_jenkins.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""Nagios check: map a Jenkins job's health score to a Nagios status."""
+
+import argparse
+import ast
+import sys
+import urllib
+
+import utils
+
+
+def check_jenkins_status(job_name, warning_threshold, critial_threshold):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical, 1 -> warning, or 0 -> OK
+    The code is determined based on the job health score and thresholds
+    passed into the script: a score at or below the critical threshold is
+    critical, otherwise a score at or below the warning threshold is a
+    warning.  Any failure to fetch or parse the job data is critical.
+    """
+    try:
+        target_url = 'http://localhost:8080/job/%s/api/python' % job_name
+        response = urllib.urlopen(target_url)
+        try:
+            # The endpoint serves a Python literal.  literal_eval parses it
+            # without executing code, unlike eval(), so a bad or hostile
+            # response cannot run anything inside the check.
+            jenkins_volume_job = ast.literal_eval(response.read())
+        finally:
+            response.close()
+
+        if not jenkins_volume_job:
+            # Without this the function fell through and returned None,
+            # which crashed the caller's tuple unpacking.
+            return 2, 'Error checking jenkins job status: empty response'
+
+        # Only the first health report entry is considered.
+        health_score = jenkins_volume_job['healthReport'][0]['score']
+        exit_code = 0
+        if health_score <= critial_threshold:
+            exit_code = 2
+        elif health_score <= warning_threshold:
+            exit_code = 1
+        return exit_code, 'Jenkins job health score is ' + str(health_score)
+
+    except Exception as e:
+        return 2, 'Error checking jenkins job status: ' + str(e)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check jenkins job status.')
+    parser.add_argument('--job', required=True, type=str,
+                        help='the job name to check for')
+    parser.add_argument('-w', required=True, type=int,
+                        help='warning threshold of health score')
+    parser.add_argument('-c', required=True, type=int,
+                        help='critical threshold of health score')
+    args = parser.parse_args()
+    code, message = check_jenkins_status(args.job, args.w, args.c)
+    print(message)
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/check_nodepool_image.py b/monitoring/nagios/checks/check_nodepool_image.py
new file mode 100755
index 0000000..97b2c0d
--- /dev/null
+++ b/monitoring/nagios/checks/check_nodepool_image.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+"""Nagios check: alert when the newest ready nodepool image is too old."""
+
+import argparse
+import re
+import sys
+
+import utils
+
+# Matches the last two columns of an image-list table row: a status word
+# followed by a decimal age.
+IMAGE_ROW_RE = re.compile(r'\|\s+(\w+)\s+\|\s+(\d+\.\d+)\s+\|$')
+
+
+def check_nodepool_image_status(warning_threshold, critial_threshold):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical, 1 -> warning, or 0 -> OK
+    The code is determined by comparing the age in hours of the newest
+    'ready' image against the thresholds passed into the script.  It is
+    also critical when no ready image can be found in the command output.
+    """
+    try:
+        image_list_raw = utils.run_command_local(
+            'sudo /usr/local/bin/nodepool image-list')
+        newest_image_age = None
+
+        for line in image_list_raw.split('\n'):
+            match = IMAGE_ROW_RE.search(line)
+            if match and match.group(1) == 'ready':
+                age = float(match.group(2))
+                if newest_image_age is None or age < newest_image_age:
+                    newest_image_age = age
+
+        # Compare against None explicitly: an age of 0.0 is a valid,
+        # brand new image and must not be reported as a failure.
+        if newest_image_age is None:
+            return 2, 'Error running command, output: ' + image_list_raw
+
+        exit_code = 0
+        if newest_image_age > critial_threshold:
+            exit_code = 2
+        elif newest_image_age > warning_threshold:
+            exit_code = 1
+        return exit_code, 'Nodepool image age (hours): ' + str(newest_image_age)
+
+    except Exception as e:
+        return 2, 'Error checking nodepool images: ' + str(e)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check nodepool image status.')
+    parser.add_argument('-w', required=True, type=int,
+                        help='warning threshold for age of the image in hours')
+    parser.add_argument('-c', required=True, type=int,
+                        help='critical threshold for age of the image in hours')
+    args = parser.parse_args()
+    code, message = check_nodepool_image_status(args.w, args.c)
+    print(message)
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/utils.py b/monitoring/nagios/checks/utils.py
new file mode 100644
index 0000000..2d432b9
--- /dev/null
+++ b/monitoring/nagios/checks/utils.py
@@ -0,0 +1,21 @@
+"""Helpers shared by the nagios check scripts."""
+
+import subprocess
+
+
+def run_command_local(command):
+    """Run command through the shell and return its output as a string
+
+    stderr is folded into stdout.  Exceptions are not propagated: when the
+    command exits non-zero the output it produced is returned, and for any
+    other failure the error text is returned instead.
+    """
+    try:
+        return subprocess.check_output(command, shell=True,
+                                       stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        # The captured output lives in e.output; e.message is always empty
+        # for CalledProcessError, which hid the command's error text.
+        return e.output
+    except Exception as e:
+        return str(e)